# Question snippet: fetch a page, pull post texts and timestamps out of it,
# and insert rows through self.mysql.insert.
# NOTE(review): indentation was lost in this paste, so it cannot be told from
# here whether the `times` loop sits inside the `contents` loop or after it.
def get_user_data(self,start_url):
# Download the raw page bytes using the instance's headers and cookies.
html = requests.get(url=start_url,headers=self.headers,cookies=self.cookies).content
# Parse the bytes with an HTML parser configured for UTF-8.
selector = etree.fromstring(html,etree.HTMLParser(encoding='utf-8'))
# Text nodes of every <span class="ctt"> and every <span class="ct">.
contents = selector.xpath('//span[@class="ctt"]/text()')
times = selector.xpath('//span[@class="ct"]/text()')
# A single dict is created here and reused for every insert below.
data = {}
for each_text in contents:
# Each pass overwrites data['content']; zero-width spaces (U+200B) are removed.
data['content'] = each_text.encode().decode('utf-8').replace('\u200b','')
for each_time in times:
# Split into at most three whitespace-separated fields; the unpacking
# requires exactly three.
month_day, time, device = each_time.split(maxsplit=2)
data['mobile_phone'] = device
# The two fields are concatenated with no separator between them.
data['create_time'] = month_day + time
data['crawl_time'] = datetime.strftime(datetime.now(),'%Y-%m-%d %H:%M:%S')
# NOTE(review): contents and times are walked by two separate loops instead
# of being paired item by item, so the same data['content'] value ends up in
# more than one inserted row.
self.mysql.insert(data)
我想将数据插入数据库,但是 data['content'] 字段会重复,我该如何修改呢?
答案 0 :(得分:1)
您应该并行遍历 `contents` 和 `times`,而不是一个接一个地遍历。尝试使用 `zip`。
def get_user_data(self, start_url, timeout=10):
    """Scrape the posts on one page and insert one database row per post.

    Fetches ``start_url`` with the instance's headers and cookies, pairs each
    ``<span class="ctt">`` text node with the ``<span class="ct">`` text node
    at the same position, and hands one dict per pair to ``self.mysql.insert``.

    Args:
        start_url: URL of the page to fetch.
        timeout: Seconds ``requests`` waits on the server before giving up.
            Passed straight to ``requests.get``; without it a stalled
            connection would block the crawl indefinitely.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
        ValueError: If a ``ct`` text has fewer than three
            whitespace-separated fields.
    """
    response = requests.get(
        url=start_url,
        headers=self.headers,
        cookies=self.cookies,
        timeout=timeout,
    )
    selector = etree.fromstring(
        response.content, etree.HTMLParser(encoding='utf-8')
    )
    contents = selector.xpath('//span[@class="ctt"]/text()')
    times = selector.xpath('//span[@class="ct"]/text()')

    # zip() walks both lists in lockstep and stops at the shorter one, so
    # every text is paired with exactly one timestamp and inserted once.
    for each_text, each_time in zip(contents, times):
        # NOTE(review): the encode/decode round trip is kept from the
        # original; presumably it turns the parser's string result into a
        # plain str before it reaches the DB driver -- confirm before removing.
        content = each_text.encode().decode('utf-8').replace('\u200b', '')
        # At most three fields: date part, clock part, and the remainder.
        month_day, clock, device = each_time.split(maxsplit=2)
        # A fresh dict per row, so nothing leaks over from the previous row.
        data = {
            'content': content,
            'mobile_phone': device,
            # Date and clock parts are joined with no separator, as before.
            'create_time': month_day + clock,
            'crawl_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        }
        self.mysql.insert(data)