Jsoup搜索优化

时间:2018-09-26 15:06:49

标签: java performance optimization jsoup

我写了一个小算法,用来从二手车销售广告网站上提取数据,但在速度方面遇到了一些问题:如果要提取超过1500条广告,大约需要35-40秒才能获得所有值。代码如下:

/**
 * Jsoup-based implementation of {@link ScrapperOlx}.
 *
 * <p>Fetches a first results page to read the number of pages, then downloads
 * every page sequentially and collects one {@code SearchResultOlx} per ad
 * (title, image link, price, URL and city).
 *
 * <p>The running indices are kept in instance fields, so an instance must not
 * be used by several threads at once.
 */
public class ScrapperOlxImplementation implements ScrapperOlx {

    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36";
    // private final String query = "dacia";
    private String lastPage = "";
    private int pageFinalNumber;
    private int index = 0;
    private int k, j, m, l, n;
    private int indexNume = 0;
    private int searchOne, searchTwo, searchThree;
    private String img = "";
    private int indexare;

    /**
     * Scrapes all result pages and returns the ads found on them.
     *
     * <p>NOTE(review): none of the parameters is referenced in the visible
     * code because the URL expressions were elided in this snippet;
     * presumably they are used to build the request URL - confirm.
     *
     * @param marca        car make filter
     * @param model        car model filter
     * @param pretDeLa     minimum price filter
     * @param pretPanaLa   maximum price filter
     * @param anFabrDeLa   minimum manufacturing year filter
     * @param anFabrPanaLa maximum manufacturing year filter
     * @param orasParam    city filter
     * @return the list of scraped ads, or {@code null} if any request or
     *         parsing step threw an exception
     */
    @Override
    public List searchOlx(String marca, String model, String pretDeLa, String pretPanaLa, String anFabrDeLa,
            String anFabrPanaLa, String orasParam) {
        Document page = null;
        try {
            // NOTE(review): the URL expression is a placeholder left by the
            // original snippet (including the unbalanced parenthesis); the
            // real URL has to be restored before this compiles.
            page = Jsoup.connect(
                    "link"))
                    .userAgent(USER_AGENT).get();

            // The last ".item.fleft" element carries the highest page number.
            // When there is no pagination element, there is a single page.
            Element lastPageElement = page.select(".item.fleft").last();
            if (lastPageElement == null) {
                pageFinalNumber = 1;
            } else {
                lastPage = lastPageElement.text();
                pageFinalNumber = Integer.parseInt(lastPage);
            }

            List<SearchResultOlx> list = new ArrayList<>();

            for (int i = 1; i <= pageFinalNumber; i++) {
                String iChanging = Integer.toString(i);

                // NOTE(review): placeholder URL expression, see above;
                // presumably built from iChanging (the page number) - confirm.
                Document page2 = Jsoup.connect(link))
                        .userAgent(USER_AGENT).get();

                Elements pageSearch = page2.select(".lheight22.margintop5");
                Elements pageSearch2 = page2.select(".space.inlblk.rel");
                Elements pageSearch3 = page2.select(".lheight22.margintop5 > a");
                Elements pageSearch4 = page2.select(".offer-wrapper");
                Elements pageSearch5 = page2.select(".breadcrumb.x-normal > span");
                Elements searchResults = page2.select(".offer-wrapper > table > tbody > tr > td[width=150] > a");

                // One result object per title; the loops below fill in the
                // remaining fields by position, using indices that keep
                // running across pages.
                for (j = 0; j < pageSearch.size(); j++) {
                    String title = pageSearch.get(j).text();
                    list.add(new SearchResultOlx(title));
                }

                for (Element searchResult : searchResults) {
                    // hasClass expects the bare class name. The selector form
                    // ".nophoto" never matches, which made the "No photo"
                    // branch unreachable.
                    boolean withoutImage = searchResult.hasClass("nophoto");
                    if (!withoutImage) {
                        String imgSrc = searchResult.select("img").attr("src");
                        list.get(indexare).setImgLink(imgSrc);
                    } else {
                        list.get(indexare).setImgLink("No photo");
                    }
                    indexare++;

                }
                for (k = 0; k < pageSearch2.size(); k++) {
                    String price = pageSearch2.get(k).text();
                    list.get(searchOne).setPrice(price);
                    searchOne++;
                }

                for (l = 0; l < pageSearch3.size(); l++) {
                    String url = pageSearch3.get(l).attr("href");
                    list.get(searchTwo).setUrl(url);
                    searchTwo++;
                }

                // Only the even-indexed breadcrumb spans are read as the city.
                for (n = 0; n < pageSearch5.size(); n++) {
                    if (n % 2 == 0) {
                        String oras = pageSearch5.get(n).text();
                        list.get(indexNume).setOras(oras);
                        indexNume++;
                    }

                }

            }

            return list;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // REINITIALIZATIONS
            // Done in finally so that a failed search cannot leave stale
            // indices behind for the next call.
            searchOne = 0;
            searchTwo = 0;
            searchThree = 0;
            j = 0;
            n = 0;
            k = 0;
            l = 0;
            m = 0;
            index = 0;
            indexNume = 0;
            indexare = 0;
        }

        return null;
    }
}

我首先获取广告页面的总数,然后遍历每个页面并提取数据。如果要使用多线程来加速,应该把它用在遍历页面的循环里,还是用在别的地方?具体该怎么实现?

0 个答案:

没有答案