I have been trying to use this code to scrape data from xhamster channels for research:
import json
from multiprocessing.dummy import Pool as ThreadPool

from lxml import html

from util import req  # project-local HTTP request helper


def get_channel_urls(url):
    # Collect the channel links from one index page
    r = req(url)
    tree = html.fromstring(r.text)
    print("Done", url)
    return [x.attrib['href'] for x in tree.xpath('//div[@class="item"]/a')]


def write_channel_data(url):
    r = req(url)
    html_text = r.text
    tree = html.fromstring(html_text)
    # Extract the JSON blob embedded as "window.initials = {...};"
    json_data = json.loads(
        tree.xpath('//script[@id="initials-script"]/text()')[0].strip().split("window.initials =")[1][:-1].strip())
    with open("channel_html/{}".format(json_data['sponsorChannel']['inurl']), 'w', encoding='utf-8') as outfile:
        outfile.write(html_text)
    print("Written data for:", url)


def main():
    # Index pages are grouped by first letter ('0' covers digits)
    letters = '0abcdefghijklmnopqrstuvwxyz'
    index_urls = ['https://xhamster.com/channels/all/{}'.format(index_letter) for index_letter in letters]
    index_urls.extend(['https://xhamster.com/gay/channels/all/{}'.format(index_letter) for index_letter in letters])
    index_urls.extend(['https://xhamster.com/shemale/channels/all/{}'.format(index_letter) for index_letter in letters])

    channel_urls = []
    for url in index_urls:
        channel_urls.extend(get_channel_urls(url))
    with open('channel_urls', 'w') as channel_url_backup_file:
        channel_url_backup_file.write("\n".join(channel_urls))

    # with open('channel_urls') as i:  # THIS IS TO READ A PRE-DOWNLOADED URL FILE
    #     channel_urls = [url.strip() for url in i.read().split()]

    with ThreadPool(processes=10) as pool:
        pool.map(write_channel_data, channel_urls)


if __name__ == '__main__':
    main()
It does work for a while, but then it fails with the error below. The traceback apparently points into the main() function, but I cannot figure out how to fix it:

IndexError: list index out of range
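For context, the only unguarded indexing on the code path is the [0] on the XPath result in write_channel_data. If a response is an error page, a redirect, or a rate-limit page, //script[@id="initials-script"] matches nothing and indexing the empty list raises the IndexError; the traceback points into main() only because pool.map re-raises worker exceptions there. Below is a minimal defensive sketch of that one function (not the original code, and the skip-and-log behavior is my assumption about what you want); it reuses the same imports and the req helper from the question:

def write_channel_data(url):
    r = req(url)
    html_text = r.text
    tree = html.fromstring(html_text)
    scripts = tree.xpath('//script[@id="initials-script"]/text()')
    if not scripts:
        # No embedded JSON on this page (error page, redirect, or
        # rate limiting): skip it instead of crashing the whole pool.
        print("No initials-script found, skipping:", url)
        return
    json_data = json.loads(
        scripts[0].strip().split("window.initials =")[1][:-1].strip())
    with open("channel_html/{}".format(json_data['sponsorChannel']['inurl']),
              'w', encoding='utf-8') as outfile:
        outfile.write(html_text)
    print("Written data for:", url)

Logging the skipped URLs would also let you verify whether the failures are transient (retry later) or structural (those pages simply lack the script tag).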