我目前正在尝试编写一个python脚本来检查google +的帖子并打印出内容
以下是我目前正在做的事情
import re
from HTMLParser import HTMLParser
class MLStripper(HTMLParser):
def __init__(self):
self.reset()
self.fed = []
def handle_data(self, d):
self.fed.append(d)
def handle_entityref(self, name):
self.fed.append('&%s;' % name)
def get_data(self):
return ''.join(self.fed)
def strip_tags(html):
s = MLStripper()
s.feed(html)
return s.get_data()
from apiclient.discovery import build
service = build("plus", "v1", http = http, developerKey = "APIKEY")
activities = service.activities().list(userId = '104898479113219628100', collection = "public").execute()
items = activities.get('items')
for item in items:
content = item['object']['content']
content = content.encode('utf8')
content = re.sub('<[^<]+?>', '', content)
content = strip_tags(content)
输出如下
???I???m here to thank you for your crazy genius parenting."+Forbes??was there at the first Take Your Parents to Work Day held at our New York....
....
....
什么是???字符?他们破坏了输出。我错过了什么吗?我接近我想输出的将是
I'm here to thank you for your crazy genius parenting. +Forbes was there at the first Take Your Parents to Work Day held at our New York....
我假设???意思是“'”和??意思是“”所以我可以替换那些,但我应该这样做吗?
PS:我从这里获得了MLStripper类,所以kuddos给了原作者!