Question

我需要一些帮助来调试我检查域徽标的代码。我在100万个域（a.com，a1.com，a2.com ...）上进行了示例测试，得到了70万个域的输出（缺少30万个域），这是我当前的代码

from scrapy.selector import Selector
import requests
from urllib.parse import urljoin
import os
import time
import concurrent.futures
FOUND_IMAGES = 'found.txt'
FOUND_DOMAINS = 'found_domains.txt'
SKIPPED = 'skipped.txt'
BASE_SIXTY = 'base64.txt'
NOT_FOUND_DOMAINS = 'input.txt'
XPATHS = ['//meta[contains(@content,"favico")]/@content', '//*[@type="image/x-icon"]/@href', 
          '//link[@rel="icon" and @type="image/png"]/@href']


def write_to_file(lst, filename):
    with open(filename, 'w') as f:
        for i in lst:
            f.write(f'{i}\n')


def read_file(fn):
    if os.path.isfile(fn):
        with open(fn, 'r') as f:
            temp = f.read().split('\n')
        return set(temp)
    else:
        return set()


def download_all_sites(sites):
    with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
        executor.map(get_image, sites)


headers = {
    'User-Agent': 'Firefox/82.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}
urls = read_file(NOT_FOUND_DOMAINS)
skipped_domains = read_file(SKIPPED)
base64 = read_file(BASE_SIXTY)
found = read_file(FOUND_IMAGES)
found_domains = read_file(FOUND_DOMAINS)
print(f"Initial length: {len(urls)}")
urls = urls - skipped_domains
urls = urls - base64
urls = urls - found_domains
if '' in urls:
    urls.remove('')
print(f"Final length: {len(urls)}")


def get_image(*urls2):
    for domain in urls2:
        img = ''
        url = f"http://www.{domain}"
        try:
            res = requests.get(url, headers=headers, timeout=20, allow_redirects=True)
        except (requests.exceptions.ConnectionError,requests.exceptions.ReadTimeout,requests.exceptions.TooManyRedirects):
            skipped_domains.add(domain)
            continue
        response = Selector(res)
        for x in XPATHS:
            img = response.xpath(x).extract_first()
            if not img or ("data:image" in img and len(img) < 50):
                continue
            else:
                break
        if not img:
            skipped_domains.add(domain)
            continue
        if "base64" in img:
            base64.add(domain)
            continue
        else:
            if not img.startswith('http'):
                image_url = urljoin(url, img)
            found.add(image_url)
            found_domains.add(domain)


print("\n\n\nStart of log")
start_time = time.time()
download_all_sites(urls)
duration = time.time() - start_time
print(f"Downloaded {len(urls)} in {duration} seconds")
write_to_file(found, FOUND_IMAGES)
write_to_file(base64, BASE_SIXTY)
write_to_file(skipped_domains, SKIPPED)
write_to_file(found_domains, FOUND_DOMAINS)
print("Completed")

理想情况下，如果我在同一输入上运行两次，则应该没有URL循环通过。我仔细检查了get_image函数，它不应跳过任何网址（即，所有域都将放置在各自的set中）

我正在使用concurrent.futures.ThreadPoolExecutor(max_workers=50)来加快过程，我相信它不会影响元素如何添加到集合中。

# these 3 lines should give 0 on second run of same input
urls = urls - skipped_domains
urls = urls - base64
urls = urls - found_domains

检查域img的脚本

0 个答案: