我需要一些帮助来调试我检查域徽标的代码。我在100万个域(a.com,a1.com,a2.com ...)上进行了示例测试,得到了70万个域的输出(缺少30万个域),这是我当前的代码
from scrapy.selector import Selector
import requests
from urllib.parse import urljoin
import os
import time
import concurrent.futures
FOUND_IMAGES = 'found.txt'
FOUND_DOMAINS = 'found_domains.txt'
SKIPPED = 'skipped.txt'
BASE_SIXTY = 'base64.txt'
NOT_FOUND_DOMAINS = 'input.txt'
XPATHS = ['//meta[contains(@content,"favico")]/@content', '//*[@type="image/x-icon"]/@href',
'//link[@rel="icon" and @type="image/png"]/@href']
def write_to_file(lst, filename):
with open(filename, 'w') as f:
for i in lst:
f.write(f'{i}\n')
def read_file(fn):
if os.path.isfile(fn):
with open(fn, 'r') as f:
temp = f.read().split('\n')
return set(temp)
else:
return set()
def download_all_sites(sites):
with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
executor.map(get_image, sites)
headers = {
'User-Agent': 'Firefox/82.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'Pragma': 'no-cache',
'Cache-Control': 'no-cache',
}
urls = read_file(NOT_FOUND_DOMAINS)
skipped_domains = read_file(SKIPPED)
base64 = read_file(BASE_SIXTY)
found = read_file(FOUND_IMAGES)
found_domains = read_file(FOUND_DOMAINS)
print(f"Initial length: {len(urls)}")
urls = urls - skipped_domains
urls = urls - base64
urls = urls - found_domains
if '' in urls:
urls.remove('')
print(f"Final length: {len(urls)}")
def get_image(*urls2):
for domain in urls2:
img = ''
url = f"http://www.{domain}"
try:
res = requests.get(url, headers=headers, timeout=20, allow_redirects=True)
except (requests.exceptions.ConnectionError,requests.exceptions.ReadTimeout,requests.exceptions.TooManyRedirects):
skipped_domains.add(domain)
continue
response = Selector(res)
for x in XPATHS:
img = response.xpath(x).extract_first()
if not img or ("data:image" in img and len(img) < 50):
continue
else:
break
if not img:
skipped_domains.add(domain)
continue
if "base64" in img:
base64.add(domain)
continue
else:
if not img.startswith('http'):
image_url = urljoin(url, img)
found.add(image_url)
found_domains.add(domain)
print("\n\n\nStart of log")
start_time = time.time()
download_all_sites(urls)
duration = time.time() - start_time
print(f"Downloaded {len(urls)} in {duration} seconds")
write_to_file(found, FOUND_IMAGES)
write_to_file(base64, BASE_SIXTY)
write_to_file(skipped_domains, SKIPPED)
write_to_file(found_domains, FOUND_DOMAINS)
print("Completed")
理想情况下,如果我在同一输入上运行两次,则应该没有URL循环通过。我仔细检查了get_image
函数,它不应跳过任何网址(即,所有域都将放置在各自的set
中)
我正在使用concurrent.futures.ThreadPoolExecutor(max_workers=50)
来加快过程,我相信它不会影响元素如何添加到集合中。
# these 3 lines should give 0 on second run of same input
urls = urls - skipped_domains
urls = urls - base64
urls = urls - found_domains