我目前正在使用 Python 2.7 在多个网站的源代码中搜索若干关键字。我想把每个关键字的查找结果分别导出到 CSV 文件中各自对应的列,如下所示:
然而,用我现在的代码得到的却是这样的结果:
我的代码:
import urllib2
import csv
# Column names of the exported CSV, in output order; passed to every
# csv.DictWriter created by the helper functions below.
fieldnames = ['Website', 'Sitemap', 'Viewport', '@media']
def csv_writerheader(path):
    """Create (or truncate) the CSV file at ``path`` and write its header row.

    The header consists of the module-level ``fieldnames`` list.
    """
    with open(path, 'w') as out_file:
        csv.DictWriter(out_file, fieldnames=fieldnames).writeheader()
def csv_writer(domainname,Sitemap, path):
    """Append a single result row to the CSV file at ``path``.

    Only the 'Website' and 'Sitemap' columns receive a value
    (``domainname`` and ``Sitemap`` respectively); the remaining columns
    listed in ``fieldnames`` are not set by this function.
    """
    row = {'Website': domainname, 'Sitemap': Sitemap}
    with open(path, 'a') as out_file:
        csv.DictWriter(out_file, fieldnames=fieldnames).writerow(row)
# Path of the CSV report that receives the search results.
csv_output_file = 'exported_print_results.csv'

# Each keyword is kept in its own one-element list.
keyword1 = ['sitemap']
keyword2 = ['viewport']
keyword3 = ['@media']

# Start the report with a fresh header row.
csv_writerheader(csv_output_file)

f = open('top1m-edited.csv')
csv_f = csv.reader(f)  # created but never read; the loop iterates the raw lines of f
for line in f:
    strdomain = line.strip()
    # Skip every line that does not contain '.nl'.
    if '.nl' not in strdomain:
        continue
    try:
        req = urllib2.Request(strdomain.strip())
        response = urllib2.urlopen(req)
        html_content = response.read()
        # All keyword checks are case-insensitive, so lower-case the page once.
        page_text = str(html_content).lower()

        # Every check below appends its own row through csv_writer().

        # keyword 1
        for searchstring in keyword1:
            outcome = 'found' if searchstring.lower() in page_text else 'not found'
            print (strdomain, keyword1, outcome)
            csv_writer(strdomain, outcome, csv_output_file)

        # keyword 2
        for searchstring in keyword2:
            outcome = 'found' if searchstring.lower() in page_text else 'not found'
            print (strdomain, keyword2, outcome)
            csv_writer(strdomain, outcome, csv_output_file)

        # keyword 3
        for searchstring in keyword3:
            outcome = 'found' if searchstring.lower() in page_text else 'not found'
            print (strdomain, keyword3, outcome)
            csv_writer(strdomain, outcome, csv_output_file)
    # HTTPError is listed before URLError so that it gets its own message.
    except urllib2.HTTPError:
        print (strdomain, 'HTTP ERROR')
    except urllib2.URLError:
        print (strdomain, 'URL ERROR')
    except urllib2.socket.error:
        print (strdomain, 'SOCKET ERROR')
    except urllib2.ssl.CertificateError:
        print (strdomain, 'SSL Certificate ERROR')
f.close()
我应该如何编辑代码才能使其正常工作?
答案 0 :(得分:1)
可以考虑使用一个字典,按关键字分别存储 found 或 not found 的结果,再把这个字典传给写 CSV 的函数。不过在此之前,您的代码还有一个问题:创建 csv writer(这里是 csv.DictWriter)时没有指定 lineterminator,而在 Windows 上写文本文件时往往需要指定它。另外,建议把所有关键字放进一个列表,用一个循环来遍历。
# Column names of the exported CSV, in output order; passed to every
# csv.DictWriter created by the helper functions below.
fieldnames = ['Website', 'Sitemap', 'Viewport', '@media']
def csv_writerheader(path):
    """Create (or truncate) the CSV file at ``path`` and write its header row.

    The header consists of the module-level ``fieldnames`` list; rows are
    terminated with a bare newline character.
    """
    with open(path, 'w') as out_file:
        header_writer = csv.DictWriter(
            out_file,
            fieldnames=fieldnames,
            lineterminator='\n',
        )
        header_writer.writeheader()
def csv_writer(dictdata, path):
    """Append ``dictdata`` as one row to the CSV file at ``path``.

    ``dictdata`` maps column names from the module-level ``fieldnames``
    list to the values to write; rows are terminated with a bare newline.
    """
    with open(path, 'a') as out_file:
        row_writer = csv.DictWriter(
            out_file,
            fieldnames=fieldnames,
            lineterminator='\n',
        )
        row_writer.writerow(dictdata)
# Path of the CSV report that receives the search results.
csv_output_file = 'exported_print_results.csv'

# Keywords to search for. They are spelled exactly like the CSV field names
# so that each keyword can be used directly as the column key of its result.
keywords = ['Sitemap', 'Viewport', '@media']

csv_writerheader(csv_output_file)

with open('top1m-edited.csv', 'r') as f:
    for line in f:
        strdomain = line.strip()
        # Only lines containing '.nl' are fetched.
        if '.nl' not in strdomain:
            continue

        # One row per site: the site itself plus one column per keyword.
        data = {'Website': strdomain}
        try:
            req = urllib2.Request(strdomain)
            response = urllib2.urlopen(req)
            html_content = response.read()
        # The exception classes must be looked up on urllib2 (the module that
        # is actually imported); HTTPError comes first because it is a
        # subclass of URLError.
        except urllib2.HTTPError:
            print (strdomain, 'HTTP ERROR')
        except urllib2.URLError:
            print (strdomain, 'URL ERROR')
        except urllib2.socket.error:
            print (strdomain, 'SOCKET ERROR')
        except urllib2.ssl.CertificateError:
            print (strdomain, 'SSL Certificate ERROR')
        else:
            # The search is case-insensitive, so lower-case the page once.
            page_text = str(html_content).lower()
            for searchstring in keywords:
                status = 'found' if searchstring.lower() in page_text else 'not found'
                print (strdomain, searchstring, status)
                data[searchstring] = status
            # Write the completed row once all keywords have been checked.
            csv_writer(data, csv_output_file)
CSV输出
Website Sitemap Viewport @media
http://www.google.nl not found not found found
http://www.youtube.nl not found found not found
http://www.facebook.nl not found found not found
答案 1 :(得分:0)
电子表格中的默认分隔符似乎不是逗号。最有可能是TAB。您可以在导入时将分隔符更改为逗号(通常有一个允许您选择它的导入对话框),或者使用TAB作为字段分隔符从Python输出。