我的解析器只加载列表的第一个href,但是我需要为列表加载所有href。这是我的代码:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
from lxml import html
import urllib.parse
import urllib.request
class VashMagaz(object):
RESULT = []
def parse_vashmagaz_run(self):
url = 'https://vashmagazin.ua/nerukhomist/kvartyry/'
r = requests.get(url)
res = html.fromstring(r.content)
result = res.xpath(u'//*[contains(text(), "120")]/@href')
num = self._get_page_num(result[0])
result = self.get_page_data(num)
return result
def get_page_data(self, num):
url = 'https://vashmagazin.ua/nerukhomist/kvartyry/?item_price1=&item_price2=&page={}'
for i in range(1, num):
r = requests.get(url.format(i))
self.get_all(r.content)
return self.RESULT
def _get_page_num(self,href):
result = urllib.parse.urlparse(href)
result = urllib.parse.parse_qs(result.query)
return int(result['page'][0])
def get_all(self, data):
data = self._get_desc(data)
for key, i in enumerate(data):
text = i.xpath('.//h3[@class="ner_h3"]/a/text()')[key]
href = i.xpath('.//h3[@class="ner_h3"]/a/@href')[key]
self.RESULT.append({'text': text,
'href': 'https://vashmagazin.ua/' + href,
})
def _get_desc(self, data):
return self.get_from_xpath(data, '//*[@id="price"]')
def get_from_xpath(self, data, xpath):
res = html.fromstring(data)
return res.xpath(xpath)
if __name__ == '__main__':
magaz = VashMagaz()
magaz.parse_vashmagaz_run()
msg = u'Subject: Квартири'+"\n"
for res in magaz.RESULT:
for k, i in res.items():
msg+=str(res[k]).strip()+"\n"
msg+='-------------------------------'+'\n'
print(msg)