我是初学者。
我正在尝试抓取google play商店并导出到csv文件。 但是我收到了一条错误消息。
UnicodeEncodeError: 'cp949' codec can't encode character '\u20a9' in position 90: illegal multibyte sequence
这是我的源代码。
当我命令打印时,它可以工作。 但它在导出到csv文件时显示错误消息
请帮帮我
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import codecs
import json
import pickle
from datetime import datetime
import sys
import csv
import os
req = 'https://play.google.com/store/search?q=hana&c=apps&num=300'
response = urllib.request.urlopen(req)
the_page = response.read()
soup = BeautifulSoup(the_page)
#app_link = soup.find('a', {'class' : 'title'})
#app_url = app_link.get('href')
for div in soup.findAll( 'div', {'class' : 'details'} ):
title = div.find( 'a', {'class':'title'} )
#print(title.get('href'))
app_url = title.get('href')
app_details={}
g_app_url = 'https://play.google.com' + app_url
app_response = urllib.request.urlopen(g_app_url)
app_page = app_response.read()
soup = BeautifulSoup(app_page)
#print(soup)
#print( g_app_url )
title_div = soup.find( 'div', {'class':'document-title'} )
app_details['title'] = title_div.find( 'div' ).get_text().strip()
subtitle = soup.find( 'a', {'class' : 'document-subtitle primary'} )
app_details['developer'] = subtitle.get_text().strip()
app_details['developer_link'] = subtitle.get( 'href' ).strip()
price_buy_span = soup.find( 'span', {'class' : 'price buy'} )
price = price_buy_span.find_all( 'span' )[-1].get_text().strip()
price = price[:-4].strip() if price != 'Install' else 'Free'
app_details['price'] = price
rating_value_meta = soup.find( 'meta', {'itemprop' : 'ratingValue'} )
app_details['rating'] = rating_value_meta.get( 'content' ).strip()
reviewers_count_meta = soup.find( 'meta', {'itemprop' : 'ratingCount'} )
app_details['reviewers'] = reviewers_count_meta.get( 'content' ).strip()
num_downloads_div = soup.find( 'div', {'itemprop' : 'numDownloads'} )
if num_downloads_div: app_details['downloads'] = num_downloads_div.get_text().strip()
date_published_div = soup.find( 'div', {'itemprop' : 'datePublished'} )
app_details['date_published'] = date_published_div.get_text().strip()
operating_systems_div = soup.find( 'div', {'itemprop' : 'operatingSystems'} )
app_details['operating_system'] = operating_systems_div.get_text().strip()
content_rating_div = soup.find( 'div', {'itemprop' : 'contentRating'} )
app_details['content_rating'] = content_rating_div.get_text().strip()
category_span = soup.find( 'span', {'itemprop' : 'genre'} )
app_details['category'] = category_span.get_text()
#print(app_details)
with open('result.csv', 'w') as f: # Just use 'w' mode in 3.x
w = csv.DictWriter(f, app_details.keys())
w.writeheader()
w.writerow(app_details)
答案 0 :(得分:3)
Python 3以语言环境默认编码打开文本文件;如果该编码无法处理您尝试写入的Unicode值,请选择其他编解码器:
with open('result.csv', 'w', encoding='UTF-8', newline='') as f:
它将任何unicode字符串编码为UTF-8,这是一种可以处理所有Unicode标准的编码。
请注意,csv
模块建议您使用newline=''
打开文件,以防止换行。
您还需要在for
循环之外一次打开文件:
with open('result.csv', 'w') as f: # Just use 'w' mode in 3.x
fields = ('title', 'developer', 'developer_link', 'price', 'rating', 'reviewers',
'downloads', 'date_published', 'operating_system', 'content_rating',
'category')
w = csv.DictWriter(f, )
w.writeheader()
for div in soup.findAll( 'div', {'class' : 'details'} ):
#
# build app_details
#
w.writerow(app_details)