我想抓取多个网站(来自CSV文件),并从Chrome的“Inspect Element”源代码中提取某些关键字(右键点击网页,然后选择Inspect Element)。
现在,我可以使用以下脚本从“View-source” - code (右键点击网页,然后通过Chrome选择View-source)中提取某些关键字:
import urllib2
import csv
# Column names of the output CSV: the site itself, then one column per keyword searched for.
fieldnames = ['Website', '@media', 'googleadservices.com/pagead/conversion.js', 'googleadservices.com/pagead/conversion_async.js']
def csv_writerheader(path):
    """Create (or truncate) the CSV file at *path* and write its header row.

    The header columns come from the module-level ``fieldnames`` list.
    """
    with open(path, 'w') as out:
        csv.DictWriter(out, fieldnames=fieldnames, lineterminator='\n').writeheader()
def csv_writer(dictdata, path):
    """Append *dictdata* as a single row to the CSV file at *path*.

    Keys of *dictdata* are matched against the module-level ``fieldnames``.
    """
    with open(path, 'a') as out:
        csv.DictWriter(out, fieldnames=fieldnames, lineterminator='\n').writerow(dictdata)
csv_output_file = 'EXPORT_Results!.csv'
# Keywords to search for; each one is also a column name in `fieldnames`.
keywords = ['@media', 'googleadservices.com/pagead/conversion.js', 'googleadservices.com/pagead/conversion_async.js']
csv_writerheader(csv_output_file)
# The input file is consumed as plain text, one site per line.
with open('top1m-edited.csv', 'r') as f:
    for line in f:
        strdomain = line.strip()
        # Only sites containing '.nl' are fetched; everything else is skipped.
        if '.nl' not in strdomain:
            continue
        # One output row per site, starting with the site itself.
        data = {'Website': strdomain}
        try:
            response = urllib2.urlopen(urllib2.Request(strdomain))
            try:
                html_content = response.read()
            finally:
                # Release the connection even if reading the body fails.
                response.close()
            # Lower-case the page once so every keyword check is case-insensitive.
            html_lower = str(html_content).lower()
            for searchstring in keywords:
                if searchstring.lower() in html_lower:
                    print (strdomain, searchstring, 'found')
                    data[searchstring] = 'found'
                else:
                    print (strdomain, searchstring, 'not found')
                    data[searchstring] = 'not found'
            # Write the completed row for this site.
            csv_writer(data, csv_output_file)
        # HTTPError is a subclass of URLError, so it has to be handled first.
        except urllib2.HTTPError:
            print (strdomain, 'HTTP ERROR')
        except urllib2.URLError:
            print (strdomain, 'URL ERROR')
        except urllib2.socket.error:
            print (strdomain, 'SOCKET ERROR')
        except urllib2.ssl.CertificateError:
            print (strdomain, 'SSL Certificate ERROR')
我编写以下代码是为了从网站上获取所需的“Inspect Element” - 源代码,以便稍后使用上面的脚本提取关键字(来自CSV文件中的多个网站)。代码:
from selenium import webdriver
# Start a Chrome session through the local chromedriver binary.
driver = webdriver.Chrome(executable_path=r'C:\Users\Jacob\PycharmProjects\Testing\chromedriver_win32\chromedriver.exe')
try:
    driver.get('https://www.rocmn.nl/')
    # "//*" matches every element; find_element returns the first match,
    # whose outerHTML is then read from the live DOM rather than the raw response.
    elem = driver.find_element_by_xpath("//*")
    source_code = elem.get_attribute("outerHTML")
    print(source_code)
finally:
    # Always shut down the browser and the chromedriver process, even on failure.
    driver.quit()
我现在想将第一个脚本与第二个脚本合并,以便只抓取CSV中所有网站的“Inspect Element”源代码,并将结果导出为CSV文件(如第一个脚本中所示)。
我完全不知道从哪里开始工作。请帮忙
答案 0 :(得分:0)
从源代码中收集关键字并不是正确的做法。真正重要的是页面正文(body)部分和 meta 标签中的关键词。无论下面的方法返回什么结果,你只需要将其减 1 即可,
/**
 * Splits the visible text of the page's {@code <body>} on a keyword and
 * reports how many pieces result.
 *
 * <p>The value is the length of the array produced by JavaScript
 * {@code String.split}, i.e. the number of pieces, not the number of
 * separators found.
 *
 * @param strKeyword the keyword passed to the script as {@code arguments[0]}
 * @return the script result rendered as a string, or {@code null} when the
 *         script failed or returned nothing
 */
private Object getTotalCount(String strKeyword) {
    // The driver must have JavaScript enabled for executeScript to work.
    JavascriptExecutor jsExecutor = wdHTMLUnitDriver;
    Object objCount = null;
    try {
        // Only the body's innerText is examined, not the whole document.
        objCount = jsExecutor.executeScript(
                "var temp = document.getElementsByTagName('body')[0].innerText;\r\nvar substrings = temp.split(arguments[0]);\r\n \r\nreturn (substrings.length);",
                strKeyword);
    } catch (Exception e) {
        // Best effort: log the failure and fall through to the null result.
        e.printStackTrace();
    }
    // Use a reference comparison: calling equals(null) on a null objCount
    // would throw, and on a non-null one it is always false.
    if (objCount == null)
        return null;
    return objCount.toString();
}