从网页抓取时出现IndexError

时间:2020-09-15 02:37:52

标签: python web-scraping web-crawler

我一直在尝试使用此代码从xhamster通道中抓取数据以进行研究

import json
from multiprocessing.dummy import Pool as ThreadPool

from lxml import html

from util import req


def get_channel_urls(url):
    r = req(url)
    tree = html.fromstring(r.text)
    print("Done", url)
    return [x.attrib['href'] for x in tree.xpath('//div[@class="item"]/a')]

def write_channel_data(url):
    r = req(url)
    html_text = r.text
    tree = html.fromstring(html_text)
    json_data = json.loads(
        tree.xpath('//script[@id="initials-script"]/text()')[0].strip().split("window.initials =")[1][:-1].strip())
    with open("channel_html/{}".format(json_data['sponsorChannel']['inurl']), 'w', encoding='utf-8') as outfile:
        outfile.write(html_text)
    print("Written data for:", url)


def main():
    letters = '0abcdefghijklmnopqrstuvqxyz'
    index_urls = ['https://xhamster.com/channels/all/{}'.format(index_letter) for index_letter in letters]
    index_urls.extend(['https://xhamster.com/gay/channels/all/{}'.format(index_letter) for index_letter in letters])
    index_urls.extend(['https://xhamster.com/shemale/channels/all/{}'.format(index_letter) for index_letter in letters])
    channel_urls = []
    for url in index_urls:
        channel_urls.extend(get_channel_urls(url))

    with open('channel_urls', 'w') as channel_url_backup_file:
        channel_url_backup_file.write("\n".join(channel_urls))

    # with open('channel_urls') as i:  # THIS IS TO READ A PRE-DOWNLOADED URL FILE
    #     channel_urls = [url.strip() for url in i.read().split()]

    with ThreadPool(processes=10) as pool:
        pool.map(write_channel_data, channel_urls)


if __name__ == '__main__':
    main()

它确实可以工作一段时间,但随后出现此错误。该错误显然是在main()函数中,但我想不出如何解决它 IndexError: list out of index

0 个答案:

没有答案