使用HtmlUnit 2.35.0进行网页抓取

时间:2019-07-06 12:42:42

标签: java web-scraping htmlunit

我正在编写一个解析器。有一个页面无法以完整形式加载(并非所有脚本都被执行)。如果用浏览器加载该页面(https://hh.ru/employer/negotiations/change_topic?r=5598e4e9000318fe590000bde1526e666d5968),一切正常;但用HtmlUnit加载时,(我认为)有些脚本没有被加载。在Firefox中,页面上的按钮是启用的;而用HtmlUnit加载同一页面时,按钮带有disabled属性,因此我无法提交(即使我删除了该属性,请求虽然发出了,但并不起作用)。所以,我不明白为什么这个页面在HtmlUnit中无法正常工作。

我的设置:

// Create the HtmlUnit client using the FIREFOX_60 browser profile, then enable
// cookies, JavaScript, insecure SSL and redirects.
WebClient webClient = new WebClient(BrowserVersion.FIREFOX_60);
webClient.getCookieManager().setCookiesEnabled(true);
webClient.getOptions().setJavaScriptEnabled(true);
// NOTE(review): 35000 is presumably milliseconds — confirm against the WebClientOptions docs.
webClient.getOptions().setTimeout(35000);
webClient.getOptions().setUseInsecureSSL(true);
webClient.getOptions().setRedirectEnabled(true);

// Overcome problems in JS: do not throw on script errors or failing status
// codes, do not print the content of failing responses, and silence CSS errors.
webClient.getOptions().setThrowExceptionOnScriptError(false);
webClient.getOptions().setPrintContentOnFailingStatusCode(false);
webClient.setCssErrorHandler(new SilentCssErrorHandler());
webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
webClient.setAjaxController(new NicelyResynchronizingAjaxController());
// NOTE(review): both waits below run before any page is requested (getPage is
// only called at the end of this snippet), so presumably there are no
// background JavaScript jobs to wait for yet. They likely belong after the
// page load — confirm against the waitForBackgroundJavaScript documentation.
webClient.waitForBackgroundJavaScript(100000);
webClient.waitForBackgroundJavaScriptStartingBefore(100000);
// NOTE(review): this line goes through getWebClient() while every other line
// uses the local webClient; presumably both refer to the same instance —
// verify, otherwise the alert handler is installed on a different client.
getWebClient().setAlertHandler(new CollectingAlertHandler(new ArrayList<>()));
webClient.getOptions().setCssEnabled(true);

// Build a GET request for `url` (declared outside this snippet) with a
// hand-written set of browser-like headers, then load the page.
WebRequest requestSettings = new WebRequest(url, HttpMethod.GET);
requestSettings.setAdditionalHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
// NOTE(review): this advertises "br"; whether this HtmlUnit version can decode
// Brotli responses is not visible here — verify, or drop "br".
requestSettings.setAdditionalHeader("Accept-Encoding", "gzip, deflate, br");
requestSettings.setAdditionalHeader("Accept-Language", "en-US,en;q=0.9");
requestSettings.setAdditionalHeader("Connection", "keep-alive");
// NOTE(review): Host is hard-coded to hh.ru regardless of what `url` points
// to; presumably the HTTP layer would derive it from the URL anyway — confirm.
requestSettings.setAdditionalHeader("Host", "hh.ru");
requestSettings.setAdditionalHeader("TE", "Trailers");
requestSettings.setAdditionalHeader("Upgrade-Insecure-Requests", "1");
// NOTE(review): this is a Chrome 65 User-Agent string, while the WebClient is
// constructed with BrowserVersion.FIREFOX_60. The server may serve markup or
// scripts for a different browser than the one being emulated — confirm
// whether this override is intended.
requestSettings.setAdditionalHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36");

// The returned page is not assigned to anything in this snippet.
webClient.getPage(requestSettings);

当我尝试调用myButton.click()时,出现此错误:

12:57:25,905 [Thread-7] ERROR com.gargoylesoftware.htmlunit.javascript.DefaultJavaScriptErrorListener - Error during JavaScript execution
======= EXCEPTION START ========
Exception class=[net.sourceforge.htmlunit.corejs.javascript.EvaluatorException]
com.gargoylesoftware.htmlunit.ScriptException: syntax error (script in https://hh.ru/employer/negotiations/change_topic from (2, 454) to (39, 18)#34)
    at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine$HtmlUnitContextAction.run(JavaScriptEngine.java:885)
    ...
    at com.gargoylesoftware.htmlunit.html.DomElement.click(DomElement.java:859)
    at com.headhunter.hhelper.Headhunter.invite(Headhunter.java:233)
    at com.headhunter.hhelper.Headhunter.doInvite(Headhunter.java:150)
    at com.headhunter.hhelper.SearchController$2.run(SearchController.java:126)
    at java.lang.Thread.run(Thread.java:745)
Caused by: net.sourceforge.htmlunit.corejs.javascript.EvaluatorException: syntax error (script in https://hh.ru/employer/negotiations/change_topic from (2, 454) to (39, 18)#34)
    at com.gargoylesoftware.htmlunit.javascript.HtmlUnitContextFactory$HtmlUnitErrorReporter.error(HtmlUnitContextFactory.java:420)
    at net.sourceforge.htmlunit.corejs.javascript.Parser.addError(Parser.java:259)
    ...
    at net.sourceforge.htmlunit.corejs.javascript.Context.compileString(Context.java:1584)
    at com.gargoylesoftware.htmlunit.javascript.HtmlUnitContextFactory$TimeoutContext.compileString(HtmlUnitContextFactory.java:222)
    at net.sourceforge.htmlunit.corejs.javascript.Context.compileString(Context.java:1573)
    at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine$1.doRun(JavaScriptEngine.java:707)
    at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine$HtmlUnitContextAction.run(JavaScriptEngine.java:870)
    ... 42 more
Enclosed exception: 
net.sourceforge.htmlunit.corejs.javascript.EvaluatorException: syntax error (script in https://hh.ru/employer/negotiations/change_topic from (2, 454) to (39, 18)#34)
    at com.gargoylesoftware.htmlunit.javascript.HtmlUnitContextFactory$HtmlUnitErrorReporter.error(HtmlUnitContextFactory.java:420)
    at net.sourceforge.htmlunit.corejs.javascript.Parser.addError(Parser.java:259)
    ...
    at com.gargoylesoftware.htmlunit.html.DomElement.click(DomElement.java:859)
    at com.headhunter.hhelper.Headhunter.invite(Headhunter.java:233)
    at com.headhunter.hhelper.Headhunter.doInvite(Headhunter.java:150)
    at com.headhunter.hhelper.SearchController$2.run(SearchController.java:126)
    at java.lang.Thread.run(Thread.java:745)
== CALLING JAVASCRIPT ==

            window.bloko = {
                fontUrl: '/'
            };
            window.globalVars = {
                locale: '',
                country: '',
                area: '',
                build: '',
                lang: '' || 'RU',
                requestId: '',
                sentryDSN: '',
                siteId: '' || '1',
                staticHost: '',
                hhcdnHost: '',
                apiHost: '',
                timeStamp: '',
                userType: '' || 'anonymous',
                cryptedUserId: '',
                employerState: '',
                vishnuIframeSrc: '',
                login: "",
                userId: '',
                hhid: '',


                autotestsComponentsInitEnd: false,


                performanceObserverEnabled: true,

                features: {"hide_resume_photo_from_untrusted_users": true, "disable_counters": false, "sentry_logging": true, "fingerprinting_enable": true, "secure_portal_enabled": true, "secure_portal_employer_registration_only": false, "employer_extensions_to_detect": "{\"vera\": \"veraBar\", \"friendwork\": \"fwi-popup\", \"potok\": \"potok_io__chrome_extension_iframe\", \"extrasaur\": \"custom-table-iframe-div\"}", "anonymous_resume_enabled": true, "sentry_js_config": "{\r\n  \"ignorePaths\": {\r\n    \"regexps\": [\r\n      \"[\\\\da-f]+/[\\\\da-f-]+/main\\\\.js\",\r\n      \".*akamaihd\\\\.net.+$\",\r\n      \"\\\\/inj_js\\\\/common\\\\.js\",\r\n      \"fingerprintjs\",\r\n      \"ckeditor4.5\",\r\n      \"axios/lib/core/createError\"\r\n    ]\r\n  },\r\n  \"ignoreErrors\": {\r\n    \"strings\": [\r\n      \"'e.data.indexOf' is not a function\",\r\n      \"Load timeout for modules:\",\r\n      \"__gCrWeb.autofill.extractForms\",\r\n      \"HTML Parsing Error: Unable to modify the parent container element before the child element is closed\",\r\n      \"Uncaught exception: TypeError: Cannot convert 'd.body' to object\",\r\n      \"Node cannot be inserted at the specified point in the hierarchy\",\r\n      \"TypeError: \u041d\u0435\u0434\u043e\u043f\u0443\u0441\u0442\u0438\u043c\u044b\u0439 \u0432\u044b\u0437\u044b\u0432\u0430\u044e\u0449\u0438\u0439 \u043e\u0431\u044a\u0435\u043a\u0442\",\r\n      \"TypeError: Invalid calling object\",\r\n      \"TypeError: 'undefined' is not an object (evaluating 'doc.forms')\",\r\n      \"Uncaught exception: TypeError: Cannot convert 'a.mini' to object\",\r\n      \"window.zAdv\",\r\n      \"backbone in Function.e.Router [as extend]\",\r\n      \"this._doc.documentElement\",\r\n      \"Can't find variable: inf\",\r\n      \"SkypeClick2Call\",\r\n      \"\u0421\u0438\u043d\u0442\u0430\u043a\u0441\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u043e\u0448\u0438\u0431\u043a\u0430\",\r\n      \"Invalid or unexpected token\",\r\n    
  \"Unexpected token <\",\r\n      \"Blocked a frame with origin\",\r\n      \"__show__deepen\",\r\n      \"expected expression, got '<'\",\r\n      \"Cannot read property 'forms' of undefined\",\r\n      \"GM_addStyle is not defined\",\r\n      \"can't redefine non-configurable property \\\"userAgent\\\"\",\r\n      \"Can't find varfiable: auto\",\r\n      \"only one instance of babel-polyfill is allowed\",\r\n      \"this.matches is not a function\",\r\n      \"NS_ERROR_NOT_INITIALIZED\",\r\n      \"NS_ERROR_UNEXPECTED\",\r\n      \"jQuery(...).size is not a function\",\r\n      \"Unexpected token ILLEGAL\",\r\n      \"Unexpected identifier\",\r\n      \"Unexpected end of input\",\r\n      \"yndx_svtn_e\",\r\n      \"TypeError: Cannot set property 'destroySlots' of undefined\",\r\n      \"Non-Error exception captured with keys: status, statusText\",\r\n      \"SyntaxError: The string did not match the expected pattern.\",\r\n      \"The operation is insecure\",\r\n      \"No identifiers allowed directly after numeric literal\",\r\n      \"wmrzz_time2 is not defined\",\r\n      \"Request failed with status code 403\",\r\n      \"SYNTAX_ERR: DOM Exception 12\",\r\n      \"maxthon\",\r\n      \"Request aborted\"\r\n    ],\r\n    \"regexps\": [\r\n      \"^undefined$\",\r\n      \"^Syntax error$\",\r\n      \"^\u041d\u0435\u043e\u043f\u0440\u0435\u0434\u0435\u043b\u0435\u043d\u043d\u0430\u044f \u043e\u0448\u0438\u0431\u043a\u0430\\\\.$\",\r\n      \"^\u041d\u0435\u0434\u043e\u043f\u0443\u0441\u0442\u0438\u043c\u044b\u0439 \u0437\u043d\u0430\u043a$\",\r\n      \"^\\\\[object Event\\\\]$\",\r\n      \"\\\\bgST\\\\b\",\r\n      \"pixelPositionVal\",\r\n      \"\u041d\u0435\u0434\u043e\u0441\u0442\u0430\u0442\u043e\u0447\u043d\u043e \u043f\u0430\u043c\u044f\u0442\u0438 \u0434\u043b\u044f \u0437\u0430\u0432\u0435\u0440\u0448\u0435\u043d\u0438\u044f \u043e\u043f\u0435\u0440\u0430\u0446\u0438\u0438[\\\\s\\\\S]+?fingerprint2\",\r\n      \"^illegal character$\",\r\n      
\"^Access is denied\\\\.\\\\s*$\",\r\n      \"^Timeout$\",\r\n      \"^Unexpected token else$\",\r\n      \"^\u041d\u0435\u0434\u043e\u0441\u0442\u0430\u0442\u043e\u0447\u043d\u043e \u043f\u0430\u043c\u044f\u0442\u0438$\",\r\n      \"^\\\\[CKEDITOR.resourceManager.load\\\\] Resource name \\\"default\\\" was not found at\",\r\n      \"can't redefine non-configurable property \\\"AceScript\\\"\",\r\n      \"\u041e\u043f\u0435\u0440\u0430\u0446\u0438\u044f \u0431\u044b\u043b\u0430 \u043e\u0442\u043c\u0435\u043d\u0435\u043d\u0430 \u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u0435\u043c.\",\r\n      \"out of memory\",\r\n      \"Network Error\",\r\n      \"Loading chunk\",\r\n      \"^No error message$\",\r\n      \"^\\\"Timeout\\\"$\"\r\n    ]\r\n  }\r\n}", "vishnu_webim_integration": true, "iframe_fix_size_banners": "504,514,500,502,260,348,674,675,370,369,368,345,346", "personal_manager_rating_enabled": true, "fp_pro_enabled": true},
                variables: ,
                cssMaping: ,
                firebaseMessagingSenderId: '',
                google_dfp_sandbox: '',
            };

======= EXCEPTION END ========
12:57:25,919 [Thread-7] WARN  com.gargoylesoftware.htmlunit.html.HtmlScript - Script is not JavaScript (type: text/html, language: ). Skipping execution.

0 个答案:

没有答案