Writing output to multiple files while web scraping - python bs4

Asked: 2019-07-06 15:35:22

Tags: python-3.x for-loop web-scraping beautifulsoup

As a preface - I am new to Python, and my HTML skills are at kindergarten level.

So, I am trying to save the quotes from this website, where there are many links, one for each US presidential election candidate.

I have already managed to get working code to extract the quotes (with the help of some Stack Overflow users), but I have no idea how to write those quotes out to a separate text file for each candidate.

For example, the first page, with all of Justin Amash's quotes, should be written to one file: JustinAmash.txt. The second page, with all of Michael Bennet's quotes, should be written to MichaelBennet.txt (or something of that form). And so on. Is there a way to do this?
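(To make the goal concrete, the end result I am after would behave roughly like the sketch below. Here candidate_quotes is purely hypothetical - a dict mapping each candidate's name to quotes already grouped per candidate. Building that grouping from the scraped pages is exactly the part I cannot figure out.)

# Hypothetical illustration only: candidate_quotes does not exist in my code yet,
# it just stands in for "quotes grouped by candidate" to show the file naming I want.
candidate_quotes = {
    "Justin Amash": ["first quote...", "second quote..."],
    "Michael Bennet": ["another quote..."],
}

for name, quotes in candidate_quotes.items():
    filename = name.replace(" ", "") + ".txt"   # e.g. JustinAmash.txt
    with open(filename, "w", encoding="utf-8") as f:
        f.write("\n".join(quotes))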

For reference, here is the code used to scrape the pages:

import bs4
from urllib.request import Request, urlopen as uReq, HTTPError
#Import HTTPError in order to avoid the links with no content/resource of interest
from bs4 import BeautifulSoup as soup_
import re
#define url of interest
my_url = 'http://archive.ontheissues.org/Free_Trade.htm'


def make_soup(url):
    # set a known browser User-Agent on the request so the server does not reject it with an HTTP error
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

    #opening up connection, grabbing the page
    uClient = uReq(req)
    page_html = uClient.read()
    uClient.close()

    #html is jumbled at the moment, so call html using soup function
    soup = soup_(page_html, "lxml") 
    return soup

# Test: print title of page
#soup.title

soup = make_soup(my_url)
tags = soup.findAll("a", href=re.compile(r"javascript:pop\("))
#print(tags)

# open a text file for writing (created if it doesn't already exist)
file1 = open("Quotefile.txt", "w")

# get list of all URLS
for links in tags:
    link = links.get('href')
    if "java" in link: 
        print("http://archive.ontheissues.org" + link[18:len(link)-3])
        main_url = "http://archive.ontheissues.org" + link[18:len(link)-3] 
        try:
            sub_soup = make_soup(main_url)
            content_collexn = sub_soup.body.contents #Splitting up the page into contents for iterative access 
            #text_data = [] #This list can be used to store data related to every person
            for item in content_collexn:
                #Accept an item if it belongs to the following classes
                if isinstance(item, str):
                    #NavigableString is a str subclass, so this also catches bare text nodes
                    print(item)
                elif(item.name == "h3"):
                    #Note that over here, every h3 tagged title has a string following it
                    print(item.get_text())   
                    #Hence, grab that string too
                    print(item.next_sibling) 
                elif(item.name in ["p", "ul", "ol"]):
                    print(item.get_text())

        except HTTPError: #Takes care of missing pages and related HTTP exception
            print("[INFO] Resource not found. Skipping to next link.")

        #print(text_data)

1 Answer:

Answer 0 (score: 0):

You can store that text data in the list text_data that your code already sets up, join all of the items, and then write them out to a file:

Something like this:

import bs4
from urllib.request import Request, urlopen as uReq, HTTPError
#Import HTTPError in order to avoid the links with no content/resource of interest
from bs4 import BeautifulSoup as soup_
import re
#define url of interest
my_url = 'http://archive.ontheissues.org/Free_Trade.htm'


def make_soup(url):
    # set a known browser User-Agent on the request so the server does not reject it with an HTTP error
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

    #opening up connection, grabbing the page
    uClient = uReq(req)
    page_html = uClient.read()
    uClient.close()

    #html is jumbled at the moment, so call html using soup function
    soup = soup_(page_html, "lxml") 
    return soup

# Test: print title of page
#soup.title

soup = make_soup(my_url)
tags = soup.findAll("a", href=re.compile(r"javascript:pop\("))
#print(tags)

# open a text file and write it if it doesn't exist
#file1 = open("Quotefile.txt","w")

# get list of all URLS
candidates = []
for links in tags:

    link = links.get('href')
    if "java" in link: 
        #print("http://archive.ontheissues.org" + link[18:len(link)-3])
        main_url = "http://archive.ontheissues.org" + link[18:len(link)-3]
        candidate = link.split('/')[-1].split('_Free_Trade')[0]

        if candidate in candidates:
            continue
        else:
            candidates.append(candidate)

        try:
            sub_soup = make_soup(main_url)
            content_collexn = sub_soup.body.contents #Splitting up the page into contents for iterative access 
            text_data = [] #This list can be used to store data related to every person
            for item in content_collexn:
                #Accept an item if it belongs to the following classes
                if isinstance(item, str):
                    #NavigableString is a str subclass, so this also catches bare text nodes
                    text_data.append(str(item))
                elif(item.name == "h3"):
                    #Note that over here, every h3 tagged title has a string following it
                    #print(item.get_text()) 
                    text_data.append(item.get_text())
                    #Hence, grab that string too (next_sibling may be None or a Tag, so guard and convert)
                    sibling = item.next_sibling
                    if sibling:
                        text_data.append(str(sibling))
                elif(item.name in ["p", "ul", "ol"]):
                    #print(item.get_text())
                    text_data.append(item.get_text())

        except HTTPError: #Takes care of missing pages and related HTTP exception
            print("[INFO] Resource not found. Skipping to next link.")
            candidates.remove(candidate)
            continue

        text_data = '\n'.join(text_data)
        with open("C:/%s.txt" % candidate, "w", encoding="utf-8") as text_file:
            text_file.write(text_data)
        print('Acquired: %s' % candidate)
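One caveat with the sketch above: the candidate string is lifted straight out of the URL, so if a link ever yields characters that are not valid in a filename, the open() call will fail. A small hypothetical helper like sanitize_filename below (not part of the original code, just an assumption about how you might harden it) can guard against that:

import re

def sanitize_filename(name):
    #Replace anything that is not a letter, digit, dot, underscore or hyphen (hypothetical helper)
    return re.sub(r'[^A-Za-z0-9_.-]', '_', name)

#Usage inside the loop above:
#with open("C:/%s.txt" % sanitize_filename(candidate), "w", encoding="utf-8") as text_file:
#    text_file.write(text_data)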