Python: Applying multiprocessing to a web parser

Time: 2018-09-29 06:06:49

Tags: python python-requests multiprocessing web-crawler

I am trying to introduce multiprocessing to speed up the requests and the web parsing. Unfortunately, I get back an empty DataFrame; it looks as though none of the worker processes are using the shared variables set up in __init__.
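From what I've read, this matches how multiprocessing works: each worker gets its own copy of the crawler object, so anything a child process appends to self.data or self.parsedurls never makes it back to the parent, which would explain the empty DataFrame. A minimal standalone sketch of the pattern I understand is usually recommended instead, where the worker returns its result and the parent assembles the DataFrame (fetch_page and the URL list below are placeholders, not part of my class):

from multiprocessing import Pool

import pandas as pd
import requests

def fetch_page(url):
    # Worker returns a plain dict instead of mutating shared state.
    try:
        r = requests.get(url, timeout=5)
        return {'url': url, 'content': r.text[:200]}
    except Exception:
        return {'url': url, 'content': None}

if __name__ == '__main__':
    urls = ['https://example.com', 'https://example.org']  # placeholder URLs
    with Pool(processes=4) as pool:
        rows = pool.map(fetch_page, urls)
    # The parent process builds the DataFrame from the returned rows.
    data = pd.DataFrame(rows)
    print(data)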

Does anyone have any insight into this problem? I tried using apply_async, but that did not seem to work either. I have also done some reading on BaseManager, but I am not sure how to apply it here.
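The other direction I looked at is sharing the collections themselves between processes. As far as I can tell, a plain multiprocessing.Manager (the simpler relative of BaseManager) can hand out proxy lists that every worker appends to. This is only a sketch under that assumption; crawl_one and the URLs are made up for illustration:

from multiprocessing import Manager, Pool

def crawl_one(args):
    # Each worker appends to Manager-backed proxy lists instead of plain lists.
    url, parsed, failed = args
    try:
        # ... request / parse would happen here ...
        parsed.append(url)
    except Exception:
        failed.append(url)

if __name__ == '__main__':
    with Manager() as manager:
        parsedurls = manager.list()
        failedurls = manager.list()
        urls = ['https://example.com', 'https://example.org']  # placeholders
        with Pool(4) as pool:
            pool.map(crawl_one, [(u, parsedurls, failedurls) for u in urls])
        print(list(parsedurls), list(failedurls))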

Update: I tried multithreading instead, which seemed to run fine at first but then started throwing errors. I am not sure what the problem is.
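My current guess is that the errors come from several threads mutating self.data and self.sublinks at the same time: self.data = pd.concat([self.data, row]) is a read-modify-write, so concurrent threads can clobber each other's updates. A toy sketch of that same pattern guarded by a lock (SafeAppend is only an illustration, not part of my crawler):

import threading

class SafeAppend(object):
    """Same replace-the-whole-object pattern as self.data, but lock-protected."""

    def __init__(self):
        self.rows = []
        self._lock = threading.Lock()

    def add(self, row):
        with self._lock:
            # Rebind the attribute under the lock so no update is lost.
            self.rows = self.rows + [row]

if __name__ == '__main__':
    store = SafeAppend()
    threads = [threading.Thread(target=store.add, args=(i,)) for i in range(20)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print(len(store.rows))  # 20 with the lock; rows can be silently dropped without it

The full crawler class I am working with follows.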

# Imports needed by this snippet (omitted from the original post); `db` is a
# database helper class defined elsewhere in my project and not shown here.
import re
import threading
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment


class WebCrawler(object):

    def __init__(self, fileDir, drillDown):
        self.fileDir = fileDir
        self.baseurls = None
        self.parsedurls = []
        self.failedurls = []
        self.drillDown = drillDown
        self.idx = 0
        self.urlCollection = None
        self.data = pd.DataFrame(columns=['url', 'content', 'frontier', 'date'])
        self.db = db()
        self.sublinks = []

    def getData(self):
        print(self.data)

    def getBaseURLs(self):
        try:
            # .ix is deprecated; take the first column positionally
            URLList = pd.read_csv(self.fileDir).iloc[:, 0].tolist()
        except Exception:
            print('Cannot load CSV file')
            URLList = []

        self.baseurls = URLList
        self.urlCollection = {0: self.baseurls}

    def getURLs(self):
        return self.urlCollection[self.idx]

    def crawlPages(self, url):
        try:
            if url not in self.parsedurls:
                print('\n Parsing %s' % (str(url)))
                r = requests.get(url, stream=True, timeout=5)
                soup = BeautifulSoup(r.text, 'html.parser')
                content = self.getVisibleText(soup)
                dt = datetime.now().strftime("%B %d, %Y %I:%M%p")
                # self.db.sql_insert(url, content, self.idx, dt)
                row = pd.Series({'url': url, 'content': content, 'frontier': self.idx, 'date': dt}).to_frame().T
                self.data = pd.concat([self.data, row])
                subs = self.getSubLinks(soup)
                self.sublinks = list(set(self.sublinks + subs))
                print('\n Successfully Parsed')
                self.parsedurls.append(url)

        except Exception:
            print('\n Error: Could not access this webpage: ', str(url))
            self.failedurls.append(url)
            print('\n Moving on to next web page')

    def getSubLinks(self, html):
        try:
            sublinks = [str(sublink.get('href')) for sublink in html.find_all('a')]
        except Exception:
            return []
        regex = re.compile('http.*.com')
        sublinks = [sublink for sublink in sublinks if regex.match(sublink)]
        return sublinks

    @staticmethod
    def tagVisible(element):
        if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
            return False
        if isinstance(element, Comment):
            return False

        return True

    def getVisibleText(self, html):
        alltext = html.find_all(text=True)
        visibleText = filter(self.tagVisible, alltext)
        alltext = u" ".join(t.strip() for t in visibleText)
        return alltext

    def main(self):
        self.db.create_table()
        self.getBaseURLs()

        while self.idx <= self.drillDown:
            urls = self.getURLs()
            threads = [threading.Thread(target=self.crawlPages, args=(url,)) for url in urls]
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join()
            self.idx += 1
            self.urlCollection[self.idx] = self.sublinks
            self.sublinks = []

        self.db.close_connection()
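For reference, the class would be driven roughly like this (the CSV path and drill-down depth below are placeholder values):

if __name__ == '__main__':
    crawler = WebCrawler('urls.csv', drillDown=2)  # placeholder arguments
    crawler.main()
    crawler.getData()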

0 Answers:

There are no answers yet.