Question

我正在尝试抓取这个网站（website）。我的抓取代码如下：

from bs4 import BeautifulSoup
import re

# Selector definitions. Each list holds a tag name and an attribute filter;
# image_tag and news_tag carry a third item naming an attribute to read.
# Only root_tag and image_tag are used in the code shown in this snippet.
root_tag=["article",{"class":"story"}]
image_tag=["img",{"":""},"org-src"]
header=["h3",{"class":"story-title"}]
news_tag=["a",{"":""},"href"]
txt_data=["p",{"":""}]



import requests
# Two candidate User-Agent strings; only ua2 is sent in the headers below.
ua1 = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
ua2 = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome'
headers = {'User-Agent': ua2,
           'Accept': 'text/html,application/xhtml+xml,application/xml;' \
                     'q=0.9,image/webp,*/*;q=0.8'}
session = requests.Session()
# NOTE(review): "website-link" looks like a placeholder for the real page URL.
response = session.get("website-link", headers=headers)
webContent = response.content


# Parse the downloaded page and collect every element matching root_tag.
bs = BeautifulSoup(webContent, 'lxml')
all_tab_data = bs.findAll(root_tag[0], root_tag[1])

# One entry is appended to output per matched element (it may be None).
output=[]
for div in all_tab_data:
    image_url = None
    # First attempt: regex search of the element's HTML text for an
    # http(s) URL ending in jpg/gif/png/jpeg.
    div_img = str(div)
    match = re.search(r"(http(s?):)([/|.|\w|\s|-])*\.(?:jpg|gif|png|jpeg)", div_img)
    print(match)
    # match = re.search(r"([^\\s]+(\\.(?i)(jpg|png|gif|bmp))$)",div)
    if match != None:
        image_url = str(match.group(0))
    else:
        # Fallback: read the image_tag[2] attribute from the first matching
        # tag. find() returns None when nothing matches, and the chained
        # .get() then raises AttributeError: 'NoneType' ... 'get'.
        image_url = div.find(image_tag[0], image_tag[1]).get(image_tag[2])
    if image_url !=None:
        # Root-relative path ("/..."): prefix it with main_url.
        # NOTE(review): main_url is not defined anywhere in this snippet.
        if image_url[0] == '/' and image_url[1] != '/':
            image_url = main_url + image_url
        # Protocol-relative URL ("//..."): replace the leading "//" with "https://".
        if image_url[0] == '/' and image_url[1] == '/':
            image_url="https://" + image_url[2:]
    output.append(image_url)

它只输出了一个 image_url，随后就抛出错误：AttributeError: 'NoneType' object has no attribute 'get'（'NoneType' 对象没有属性 'get'）。

Answer 1

您可能应该尝试重用解析库，而不是自己解析各个部分。考虑这种方法：

from bs4 import BeautifulSoup
import re

# Selector definitions: [tag name, attribute filter] and, where relevant, the
# name of the attribute whose value should be read from the matched tag.
root_tag =  ["article", {"class":"story"}]
image_tag = ["img", {"":""}, "org-src"]
header =    ["h3", {"class":"story-title"}]
news_tag =  ["a", {"":""}, "href"]
txt_data =  ["p", {"":""}]



# One-time download of the page into the local file 'output'. It is left
# commented out so the parsing below can be re-run against the saved copy.
# import requests
# ua1 = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
# ua2 = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit 537.36 (KHTML, like Gecko) Chrome'
# headers = {'User-Agent': ua2,
#            'Accept': 'text/html,application/xhtml+xml,application/xml;' \
#                      'q=0.9,image/webp,*/*;q=0.8'}
# session = requests.Session()
# response = session.get("https://www.reuters.com/energy-environment", headers=headers)
# webContent = response.content

# file = open('output', 'wb')
# file.write(webContent)
# file.close()

# The saved page was written in binary mode, so read it back as bytes and let
# BeautifulSoup detect the encoding instead of relying on the platform's
# default text encoding. The context manager guarantees the file is closed.
with open('output', 'rb') as file:
    webContent = file.read()


bs = BeautifulSoup(webContent, 'html.parser')
all_tab_data = bs.find_all(root_tag[0], root_tag[1])

# Take the tag name and the URL-carrying attribute from the selector so the
# attribute name is spelled in exactly one place.
image_tag_name, image_url_attr = image_tag[0], image_tag[2]

output = []
for div in all_tab_data:
    # Search inside the already-parsed article tag; converting it to a string
    # and parsing it a second time is unnecessary. The {attr: True} filter
    # keeps only tags that actually carry the attribute, and find_all always
    # returns a list (possibly empty), so no None checks are needed.
    article_images = div.find_all(image_tag_name, attrs={image_url_attr: True})
    output.extend(img[image_url_attr] for img in article_images)

无法使用BeautifulSoup抓取图片网址

1 个答案: