这段代码的用途是获取所提供 URL 列表中每个页面的 HTML &lt;title&gt; 标题。
在某些主机上会出现以下错误;并且在处理完成后,我必须强制终止程序才能退出。请帮我解决这些问题。我收到的错误如下:
httperror_seek_wrapper:HTTP错误404:/
#!/usr/bin/python
import os
import urllib
import workerpool
from BeautifulSoup import BeautifulSoup
from mechanize import Browser
import sys
def titleprint(url):
br = Browser()
br.set_handle_robots(False)
res = br.open(url, None, 2.5)
data = res.get_data()
soup = BeautifulSoup(data)
title = soup.find('title')
if soup.title != None:
print url, title.renderContents(), '\n'
# Initialize a pool of 5 worker threads; ``titleprint`` is called once
# per URL read from urls.txt.
pool = workerpool.WorkerPool(size=5)

# Read the URL list with a context manager so the file handle is closed
# (the original leaked it), and strip trailing newlines so mechanize
# receives clean URLs.
with open("urls.txt") as url_file:
    urls = [line.strip() for line in url_file if line.strip()]

pool.map(titleprint, urls)

# Send shutdown jobs to all threads, then block until every queued job
# has been completed.
pool.shutdown()
pool.wait()
答案 0(得分:1):
发现问题了:我使用的是 urllib 而不是 urllib2。修正后的代码如下,但我仍无法让 SIGINT(Ctrl+C)中断生效。
import os
import urllib2
import socket
import workerpool
from BeautifulSoup import BeautifulSoup
from mechanize import Browser
import signal
import time
import sys
def titleprint(url):
try:
br = Browser()
br.set_handle_robots(False)
res = br.open(url, None, 2.5)
data = res.get_data()
soup = BeautifulSoup(data)
title = soup.find('title')
if soup.title != None:
print url, title.renderContents(), '\n'
else:
print "No Title Found"
except urllib2.URLError, e:
print url,"Oops, timed out?", '\n'
except socket.error,e:
print url,"Oops, timed out?", '\n'
except socket.timeout:
print url,"Oops, timed out?", '\n'
def signal_handler(signum, frame):
    """SIGINT (Ctrl+C) handler: report the interrupt and exit with 0.

    The first parameter is renamed from ``signal`` to ``signum`` so the
    handler no longer shadows the imported ``signal`` module. The signal
    machinery calls handlers positionally, so the rename is safe.
    """
    print('You pressed Ctrl+C!')
    sys.exit(0)
signal.signal(signal.SIGINT, signal_handler)
pool = workerpool.WorkerPool(size=20)
pool.map(titleprint, open("urls.txt").readlines())
pool.shutdown()
pool.wait()
print 'Processing of list completed, Cheers!!'
sys.exit(1)
print('Stop the script using Ctrl+C')
signal.pause()