我的解析器只加载列表的第一个href

时间:2017-01-17 11:36:46

标签: python parsing lxml

我的解析器只加载列表的第一个href,但是我需要为列表加载所有href。这是我的代码:

#!/usr/bin/python
#  -*- coding: utf-8 -*-

import requests
from lxml import html
import urllib.parse
import urllib.request

class VashMagaz(object):

    RESULT = []

    def parse_vashmagaz_run(self):
        url = 'https://vashmagazin.ua/nerukhomist/kvartyry/'
        r = requests.get(url)
        res = html.fromstring(r.content)
        result = res.xpath(u'//*[contains(text(), "120")]/@href')
        num = self._get_page_num(result[0])
        result = self.get_page_data(num)
        return result

    def get_page_data(self, num):
        url = 'https://vashmagazin.ua/nerukhomist/kvartyry/?item_price1=&item_price2=&page={}'
        for i in range(1, num):
            r = requests.get(url.format(i))
            self.get_all(r.content)
        return self.RESULT

    def _get_page_num(self,href):
        result = urllib.parse.urlparse(href)
        result = urllib.parse.parse_qs(result.query)
        return int(result['page'][0])

    def get_all(self, data):
        data = self._get_desc(data)
        for key, i in enumerate(data):
            text = i.xpath('.//h3[@class="ner_h3"]/a/text()')[key]
            href = i.xpath('.//h3[@class="ner_h3"]/a/@href')[key]
            self.RESULT.append({'text': text,
                                'href': 'https://vashmagazin.ua/' + href,
                                })


    def _get_desc(self, data):
        return self.get_from_xpath(data, '//*[@id="price"]')

    def get_from_xpath(self, data, xpath):
        res = html.fromstring(data)
        return res.xpath(xpath)

if __name__ == '__main__':
    magaz = VashMagaz()
    magaz.parse_vashmagaz_run()
    msg = u'Subject: Квартири'+"\n"

    for res in magaz.RESULT:
        for k, i in res.items():
            msg+=str(res[k]).strip()+"\n"
        msg+='-------------------------------'+'\n'

    print(msg)

0 个答案:

没有答案