def get_user_data(self,start_url):
html = self.session.get(url=start_url,headers=self.headers,cookies=self.cookies).content
selector = etree.fromstring(html,etree.HTMLParser(encoding='utf-8'))
all_user = selector.xpath('//div[contains(@class,"c") and contains(@id,"M")]')
for i in all_user:
user_id = i.xpath('./div[1]/a[@class="nk"]/@href')[0]
content = i.xpath('./div[1]/span[1]')[0]
contents = content.xpath('string(.)')
times = i.xpath('./div/span[@class="ct"]/text()')[0]
if len(i.xpath('./div[3]')):
imgages = i.xpath('./div[2]/a/img/@src')
praise_num = i.xpath('./div[3]/a[2]/text()')
transmit_num = i.xpath('./div[3]/a[3]/text()')
elif len(i.xpath('./div[2]')):
imgages = i.xpath('./div[2]/a/img/@src')
praise_num = i.xpath('./div[2]/a[3]/text()')
transmit_num = i.xpath('./div[2]/a[4]/text()')
else :
imgages = ''
praise_num = i.xpath('./div[1]/a[2]/text()')
transmit_num = i.xpath('./div[1]/a[3]/text()')
try:
if re.search('from',times.encode().decode('utf-8')):
month_day, time, device = times.split(maxsplit=2)
self.data['mobile_phone'] = device
else:
time,device = times.split(maxsplit=1)
self.data['month_day'] = ''
self.data['create_time'] = month_day + ' ' + time
except Exception as e:
print('failure:',e)
self.data['crawl_time'] = datetime.strftime(datetime.now(),'%Y-%m-%d %H:%M:%S')
self.data['user_id'] = user_id
self.data['contents'] = contents.encode().decode('utf-8').replace('\u200b','')
self.data['imgages'] = imgages
self.data['praise_num'] = praise_num
self.data['transmit_num'] = transmit_num
with open('a.txt','a',encoding='utf-8') as f:
f.write(json.dumps(self.data)+'\n')
我尝试抓取每一页数据并将其保存到数据中。但我写错了,因为我在每个页面上只保存了一段数据' a.txt'。我该怎么办?写入以在' a.txt'?
中正确保存每一页数据答案 0 :(得分:0)
写入操作在for循环之外,这就是为什么它只将最后一次迭代数据添加到文件
with open('a.txt','a',encoding='utf-8') as f:
f.write(json.dumps(self.data)+'\n')
答案 1 :(得分:-1)
您在循环的每次迭代中都会覆盖self.data
中的各种值。
相反,self.data
应该是一个列表。您应该在每次迭代中创建一个新字典,并将其附加到最后的数据中。
self.data = []
for i in all_user:
values = {}
...
values['crawl_time'] = ...
values['user_id'] = ...
...
self.data.append(values)