我制作了一个简单的抓取工具:它读取一个包含 Google Play 包名(例如 com.viber.voip)的 CSV 文件,然后访问对应的完整链接,例如 https://play.google.com/store/apps/details?id=com.viber.voip&hl=en 。
然后它抓取标题、发布者、下载量等信息并存储到列表中。问题是,当我试图将结果保存到 CSV 文件时,使用 pandas 的 to_csv 导出会引发错误,或者在遇到某些未知字符时抛出 UnicodeError。我尝试过添加 .encode 或 .decode,但没有帮助。有人可以协助吗?
import bs4 as bs
import urllib.request
import pandas as pd
import csv
def searcher(bundles):
    """Download one app details page and scrape its metadata fields.

    The page URL is built as ``base_url + bundles + post_url`` using the
    module-level ``base_url`` and ``post_url`` strings.

    Args:
        bundles: Package id string, e.g. ``com.viber.voip``.

    Returns:
        An 8-tuple ``(bundles, title_app, publisher_name, category, ratings,
        reviews, downloads, updated_last_time)``.

    Raises:
        AttributeError: If one of the expected elements is absent from the
            page, because ``soup.find`` then yields ``None``.
    """
    # Use the response as a context manager so the HTTP connection is closed
    # right after reading instead of lingering once per crawled package.
    with urllib.request.urlopen(base_url + bundles + post_url) as response:
        html = response.read()
    soup = bs.BeautifulSoup(html, 'html.parser')

    title_app = soup.title.get_text()
    publisher_name = soup.find('a', {'class': 'document-subtitle primary'}).get_text()
    category = soup.find('a', {'class': 'document-subtitle category'}).get_text()
    # The rating is read from the meta tag's "content" attribute, not its text.
    ratings = soup.find('meta', {'itemprop': 'ratingValue'}).get('content')
    reviews = soup.find('span', {'class': 'reviews-num'}).get_text()
    downloads = soup.find('div', {'itemprop': 'numDownloads'}).get_text()
    # First <div class="content"> on the page; presumably the "Updated" date
    # -- verify against the live markup.
    updated_last_time = soup.find('div', {'class': 'content'}).get_text()

    return (bundles, title_app, publisher_name, category, ratings, reviews,
            downloads, updated_last_time)
def store(crawled_data, output_path='output.csv'):
    """Write the crawled rows to a CSV file, preceded by a header row.

    Args:
        crawled_data: Iterable of rows, each holding the eight values named
            in the header (bundles, title_app, ...). ``None`` entries are
            skipped.
        output_path: Path of the CSV file to create or overwrite.
    """
    labels = ['bundles', 'title_app', 'publisher_name', 'category', 'ratings', 'reviews', 'downloads', 'updated_last_time']
    # Open a dedicated, writable output file. newline='' is what the csv
    # module expects, and explicit UTF-8 lets non-ASCII text (such as
    # publisher names) be written regardless of the locale's default encoding.
    with open(output_path, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(labels)
        # Write the scraped rows themselves; iterating a DataFrame would
        # yield its column labels rather than its rows.
        writer.writerows(row for row in crawled_data if row is not None)
# URL prefix and suffix that searcher() wraps around each package id.
base_url = 'https://play.google.com/store/apps/details?id='
post_url = '&hl=en'

# One scraped tuple per package, plus a running count used for progress output.
crawled_data = []
crawled_packages = 0

with open('links.csv', 'r') as f:
    df = pd.read_csv(f)
    # pandas represents empty cells as NaN, not None, so comparing against
    # None would let them through; dropna() actually removes them.
    urls = df['URLs'].dropna()
    for bundles in urls:
        aaa = searcher(bundles)
        # Printed before the increment, so progress output starts at 0.
        print(crawled_packages)
        crawled_packages += 1
        crawled_data.append(aaa)
    store(crawled_data)
答案 0 :(得分:0)
您可以使用 to_csv(),并指定要写入的输出文件。在构建数据框时,同时指定列名:
import bs4 as bs
import urllib.request
import pandas as pd
import csv
def searcher(bundles):
    """Download one app details page and scrape its metadata fields.

    The page URL is built as ``base_url + bundles + post_url`` using the
    module-level ``base_url`` and ``post_url`` strings. Text fields other
    than the page title are whitespace-stripped.

    Args:
        bundles: Package id string, e.g. ``com.viber.voip``.

    Returns:
        An 8-tuple ``(bundles, title_app, publisher_name, category, ratings,
        reviews, downloads, updated_last_time)``.

    Raises:
        AttributeError: If one of the expected elements is absent from the
            page, because ``soup.find`` then yields ``None``.
    """
    page_url = base_url + bundles + post_url
    # Use the response as a context manager so the HTTP connection is closed
    # right after reading instead of lingering once per crawled package.
    with urllib.request.urlopen(page_url) as response:
        html = response.read()
    soup = bs.BeautifulSoup(html, 'html.parser')

    title_app = soup.title.get_text()
    publisher_name = soup.find('a', {'class': 'document-subtitle primary'}).get_text(strip=True)
    category = soup.find('a', {'class': 'document-subtitle category'}).get_text(strip=True)
    # The rating is read from the meta tag's "content" attribute, not its text.
    ratings = soup.find('meta', {'itemprop': 'ratingValue'}).get('content')
    reviews = soup.find('span', {'class': 'reviews-num'}).get_text(strip=True)
    downloads = soup.find('div', {'itemprop': 'numDownloads'}).get_text(strip=True)
    # First <div class="content"> on the page; presumably the "Updated" date
    # -- verify against the live markup.
    updated_last_time = soup.find('div', {'class': 'content'}).get_text(strip=True)

    return (bundles, title_app, publisher_name, category, ratings, reviews,
            downloads, updated_last_time)
def store(crawled_data):
    """Export the crawled rows to ``output.csv`` with a named header row.

    Args:
        crawled_data: Sequence of 8-item rows, one per crawled package, in
            the column order listed below.
    """
    column_names = [
        'bundles',
        'title_app',
        'publisher_name',
        'category',
        'ratings',
        'reviews',
        'downloads',
        'updated_last_time',
    ]
    # index=False keeps the DataFrame's positional index out of the file.
    pd.DataFrame(crawled_data, columns=column_names).to_csv('output.csv', index=False)
# URL prefix and suffix that searcher() wraps around each package id.
base_url = 'https://play.google.com/store/apps/details?id='
post_url = '&hl=en'

# One scraped tuple per package, plus a running count used for progress output.
crawled_data = []
crawled_packages = 0

with open('links.csv', 'r') as f:
    df = pd.read_csv(f)

# pandas represents empty cells as NaN, not None, so comparing against None
# would let them through; dropna() actually removes them.
urls = df['URLs'].dropna()
for bundles in urls:
    aaa = searcher(bundles)
    # Printed before the increment, so progress output starts at 0.
    print(crawled_packages)
    crawled_packages += 1
    crawled_data.append(aaa)

store(crawled_data)
这将生成一个 output.csv 文件,其内容如下:
bundles,title_app,publisher_name,category,ratings,reviews,downloads,updated_last_time
com.viber.voip,Viber Messenger - Android Apps on Google Play,Viber Media S.à r.l.,Communication,4.336112022399902,"11,016,404","500,000,000 - 1,000,000,000","March 15, 2018"