I am trying to introduce multiprocessing to help speed up the runtime of the requests and web parsing. Unfortunately, I get back an empty DataFrame: it looks like each process is not using the shared variables set in __init__. Does anyone have insight into this problem? I tried using apply_async, but that didn't seem to work either. I also did some research into BaseManager, but I'm not sure how to apply it.
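To illustrate the behavior I mean, here is a standalone sketch (not my actual crawler): each worker process operates on its own copy of the object, so mutations made in a child are never visible in the parent.

import multiprocessing as mp

class Collector:
    def __init__(self):
        self.results = []  # only shared within a single process

    def work(self, n):
        # runs in a child process and mutates that child's copy of self
        self.results.append(n * n)

if __name__ == '__main__':
    c = Collector()
    with mp.Pool(4) as pool:
        pool.map(c.work, range(5))
    print(c.results)  # prints [] -- the parent's copy was never touched

This mirrors what I believe is happening to self.data in the crawler below.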
Update: I tried multithreading instead. It seemed to run fine at first, but then it started throwing errors partway through; I'm not sure what the problem is.
import re
import threading
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment


class WebCrawler(object):
    def __init__(self, fileDir, drillDown):
        self.fileDir = fileDir          # path to the CSV of seed URLs
        self.baseurls = None            # seed URLs loaded from the CSV
        self.parsedurls = []            # URLs crawled successfully
        self.failedurls = []            # URLs that raised an error
        self.drillDown = drillDown      # how many link frontiers to follow
        self.idx = 0                    # current frontier level
        self.urlCollection = None       # maps frontier level -> list of URLs
        self.data = pd.DataFrame(columns=['url', 'content', 'frontier', 'date'])
        self.db = db()                  # db is my own helper class (not shown)
        self.sublinks = []              # links discovered at the current level
    def getData(self):
        print(self.data)
    def getBaseURLs(self):
        try:
            # .ix is removed in recent pandas; use .iloc for positional access
            URLList = pd.read_csv(self.fileDir).iloc[:, 0].tolist()
        except Exception:
            print('Cannot load CSV file')
            return
        self.baseurls = URLList
        self.urlCollection = {0: self.baseurls}
    def getURLs(self):
        return self.urlCollection[self.idx]
    def crawlPages(self, url):
        try:
            if url not in self.parsedurls:
                print('\n Parsing %s' % (str(url)))
                r = requests.get(url, stream=True, timeout=5)
                soup = BeautifulSoup(r.text, 'html.parser')
                content = self.getVisibleText(soup)
                dt = datetime.now().strftime("%B %d, %Y %I:%M%p")
                # self.db.sql_insert(url, content, self.idx, dt)
                row = pd.Series({'url': url, 'content': content, 'frontier': self.idx, 'date': dt}).to_frame().T
                # rebinds self.data; this is shared, unsynchronized state across workers
                self.data = pd.concat([self.data, row], ignore_index=True)
                subs = self.getSubLinks(soup)
                self.sublinks = list(set(self.sublinks + subs))
                print('\n Successfully Parsed')
                self.parsedurls.append(url)
        except Exception:
            print('\n Error: Could not access this webpage: ', str(url))
            self.failedurls.append(url)
            print('\n Moving on to next web page')
    def getSubLinks(self, html):
        try:
            sublinks = [str(sublink.get('href')) for sublink in html.find_all('a')]
        except Exception:
            return []
        # keep only absolute links to .com hosts (dot escaped in the pattern)
        regex = re.compile(r'http.*\.com')
        sublinks = [sublink for sublink in sublinks if regex.match(sublink)]
        return sublinks
    @staticmethod
    def tagVisible(element):
        if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
            return False
        if isinstance(element, Comment):
            return False
        return True
    def getVisibleText(self, html):
        alltext = html.find_all(text=True)
        visibleText = filter(self.tagVisible, alltext)
        alltext = u" ".join(t.strip() for t in visibleText)
        return alltext
    def main(self):
        self.db.create_table()
        self.getBaseURLs()
        while self.idx <= self.drillDown:
            urls = self.getURLs()
            # one thread per URL at the current frontier level
            threads = [threading.Thread(target=self.crawlPages, args=(url,)) for url in urls]
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join()
            self.idx += 1
            self.urlCollection[self.idx] = self.sublinks
            self.sublinks = []
        self.db.close_connection()
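For completeness, this is roughly how I drive the class ('urls.csv' is a placeholder path whose first column holds the seed URLs; drillDown is the number of frontier levels to follow):

if __name__ == '__main__':
    crawler = WebCrawler('urls.csv', drillDown=2)
    crawler.main()
    crawler.getData()  # prints the accumulated DataFrame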