过去几天我一直在乱用python,在跟随Edmund Martin的tutorial时遇到了一个问题:
我想将我删除的名称和标题附加到CSV文件中。 唯一的问题是我抓取的数据没有出现在文件中。
你能否向我解释为什么只有"排名" "描述"和"标题"正在写入CSV文件而不是实际数据。另外我该如何解决?
以下是我在教程网站上找到的代码,其中包含我添加的最后三行:
import requests
from bs4 import BeautifulSoup
import time
import csv
USER_AGENT = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 '
'Safari/537.36'}
def fetch_results(search_term, number_results, language_code):
assert isinstance(search_term, str), 'Search term must be a string'
assert isinstance(number_results, int), 'Number of results must be an integer'
escaped_search_term = search_term.replace(' ', '+')
google_url = 'https://www.google.com/search?q={}&num={}&hl={}'.format(
escaped_search_term, number_results, language_code)
response = requests.get(google_url, headers=USER_AGENT)
response.raise_for_status()
return search_term, response.text
def parse_results(html, keyword):
soup = BeautifulSoup(html, 'html.parser')
found_results = []
rank = 1
result_block = soup.find_all('div', attrs={'class': 'g'})
for result in result_block:
link = result.find('a', href=True)
title = result.find('h3', attrs={'class': 'r'})
description = result.find('span', attrs={'class': 'st'})
if link and title:
link = link['href']
title = title.get_text()
description = description.get_text()
if link != '#':
found_results.append({
'rank': rank,
'title': title,
'description': description
})
rank += 1
return found_results
def scrape_google(search_term, number_results, language_code):
try:
keyword, html = fetch_results(search_term, number_results, language_code)
results = parse_results(html, keyword)
return results
except AssertionError:
raise Exception("Incorrect arguments parsed to function")
except requests.HTTPError:
raise Exception("You appear to have been blocked by Google")
except requests.RequestException:
raise Exception("Appears to be an issue with your connection")
if __name__ == '__main__':
keywords = ['python']
data = []
for keyword in keywords:
try:
results = scrape_google(keyword,2, "en")
for result in results:
data.append(result)
except Exception as e:
print(e)
finally:
time.sleep(1)
print(data)
with open('python_scrape.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(data)
csvFile.close()import requests
from bs4 import BeautifulSoup
import time
import csv
USER_AGENT = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 '
'Safari/537.36'}
def fetch_results(search_term, number_results, language_code):
assert isinstance(search_term, str), 'Search term must be a string'
assert isinstance(number_results, int), 'Number of results must be an integer'
escaped_search_term = search_term.replace(' ', '+')
google_url = 'https://www.google.com/search?q={}&num={}&hl={}'.format(
escaped_search_term, number_results, language_code)
response = requests.get(google_url, headers=USER_AGENT)
response.raise_for_status()
return search_term, response.text
def parse_results(html, keyword):
soup = BeautifulSoup(html, 'html.parser')
found_results = []
rank = 1
result_block = soup.find_all('div', attrs={'class': 'g'})
for result in result_block:
link = result.find('a', href=True)
title = result.find('h3', attrs={'class': 'r'})
description = result.find('span', attrs={'class': 'st'})
if link and title:
link = link['href']
title = title.get_text()
description = description.get_text()
if link != '#':
found_results.append({
'rank': rank,
'title': title,
'description': description
})
rank += 1
return found_results
def scrape_google(search_term, number_results, language_code):
try:
keyword, html = fetch_results(search_term, number_results, language_code)
results = parse_results(html, keyword)
return results
except AssertionError:
raise Exception("Incorrect arguments parsed to function")
except requests.HTTPError:
raise Exception("You appear to have been blocked by Google")
except requests.RequestException:
raise Exception("Appears to be an issue with your connection")
if __name__ == '__main__':
keywords = ['python']
data = []
for keyword in keywords:
try:
results = scrape_google(keyword,2, "en")
for result in results:
data.append(result)
except Exception as e:
print(e)
finally:
time.sleep(1)
print(data)
with open('python_scrape.csv', 'w') as csvFile:
writer = csv.writer(csvFile)
writer.writerows(data)
csvFile.close()
感谢您的帮助!
答案 0 :(得分:0)
def parse_results(html, keyword):
# code ....
for result in result_block:
link = result.find('a', href=True) # here you get links
title = result.find('h3', attrs={'class': 'r'}) # here you get title
description = result.find('span', attrs={'class': 'st'}) # here you get description
# if you want something to search here
# for example you can print(result) here an see what data have result variable
# and after that parse that data and save in variable for example
# body = result.find('h1', attrs={'class': 'h1'})
if link and title:
link = link['href']
title = title.get_text()
description = description.get_text()
# here we take text from that body
# body = body.get_text()
if link != '#':
found_results.append({
'rank': rank,
'title': title,
'description': description,
# and here we append to list
'body': body
})
rank += 1
return found_results
答案 1 :(得分:0)
因为你使用的是csv.writer.writerows(以's'结尾,行是复数形式)而不是writerow,所以csv writer需要一个“可迭代对象”列表,它将它视为行。
你的main()函数使用scrape_google()返回一个词典列表,这些词典都像{'rank':rank,'title':title,'description':description}。
Python通过返回每个键来遍历字典,因此作者所看到的只是每行中的“rank”,“title”和“description”键。
解决问题的最快方法是添加一行
results = [[j[i] for i in j] for j in results]
在你的“with open('python_scrape.csv'...”行之前。这使用了列表理解,这对于作为一个新的python用户来说是一件好事。
修复代码的更好方法是确保它正在构建一个列表,列表要写入csv而不是字典列表。