Getting AttributeError: 'NoneType' object has no attribute 'text' (web scraping)

Date: 2020-10-28 06:37:51

Tags: python selenium web-scraping beautifulsoup google-colaboratory

This is my web-scraping case study. I'm running into "'NoneType' object has no attribute 'text'" in the final piece of code, so I tried to fix it with the getattr function, but it isn't working.

'''

import requests
from bs4 import BeautifulSoup

url = 'https://www.birdsnest.com.au/womens/dresses'

source = requests.get(url)
soup = BeautifulSoup(source.content, 'lxml')

'''

productlist= soup.find_all('div', id='items')

'''

productlinks = []
for item in productlist:
  for link in item.find_all('a',href=True):
      productlinks.append(url + link['href'])
print(len(productlinks))

'''

productlinks = []
for x in range(1,28):
  source = requests.get(f'https://www.birdsnest.com.au/womens/dresses?_lh=1&page={x}')
  soup = BeautifulSoup(source.content, 'lxml')
  for item in productlist:
      for link in item.find_all('a',href=True):
        productlinks.append(url + link['href'])
print(productlinks)

'''

for link in productlinks:
    source = requests.get(link)
    soup = BeautifulSoup(source.content, 'lxml')

    name = soup.find('h1',class_='item-heading__name').text.strip()
    price = soup.find('p',class_='item-heading__price').text.strip()
    feature = soup.find('div',class_='tab-accordion__content active').text.strip()

    sum = {
      'name':name,
      'price':price,
      'feature':feature
          }
    print(sum)

'''

  ---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-7-d4d46558690d> in <module>()
      3     soup = BeautifulSoup(source.content, 'lxml')
      4 
----> 5     name = soup.find('h1',class_='item-heading__name').text.strip()
      6     price = soup.find('p',class_='item-heading__price').text.strip()
      7     feature = soup.find('div',class_='tab-accordion__content active').text.strip()

AttributeError: 'NoneType' object has no attribute 'text'

---------------------------------------------------------------------------

So I tried to fix it this way, but it didn't work.

for link in productlinks:
    source = requests.get(link)
    soup = BeautifulSoup(source.content, 'lxml')

    name = getattr(soup.find('h1',class_='item-heading__name'),'text',None)
    price = getattr(soup.find('p',class_='item-heading__price'),'text',None)
    feature = getattr(soup.find('div',class_='tab-accordion__content active'),'text',None)

    sum = {
      'name':name,
      'price':price,
      'feature':feature
          }
    print(sum)

Here is the output. It only shows 'NoneType':

{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}

1 Answer:

Answer 0 (score: 0)

First of all, always turn off JavaScript for the page you want to scrape. You'll then notice that the tag classes change, and those are exactly the classes you were trying to target.
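You can see this for yourself with a minimal check (it reuses the listing URL from the question and the itemprop selector from the code further down; the exact markup may of course change over time):

import requests
from bs4 import BeautifulSoup

# requests never executes JavaScript, so this is the server-rendered HTML only.
url = "https://www.birdsnest.com.au/womens/dresses"
soup = BeautifulSoup(requests.get(url).content, "html.parser")

# The class targeted in the question is not in the static markup -> None.
print(soup.find("h1", class_="item-heading__name"))

# The itemprop attributes used in the code below are present in the static markup.
print(len(soup.find_all("h2", {"itemprop": "name"})))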

Also, when looping through the pages, don't forget that the stop value of Python's range() is not inclusive. That is, range(1, 28) will stop at page 27.
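A quick check:

print(list(range(1, 28))[-1])  # 27 -> page 28 would be skipped
print(list(range(1, 29))[-1])  # 28 -> every page is covered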

Here's how I would go about it:

import json

import requests
from bs4 import BeautifulSoup


# Session cookies and a browser-like User-Agent so the site serves the full page.
cookies = {
    "ServerID": "1033",
    "__zlcmid": "10tjXhWpDJVkUQL",
}

headers = {
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
}


def extract_info(bs: BeautifulSoup, tag: str, attr_value: str) -> list:
    # Collect the stripped text of every <tag itemprop="attr_value"> on the page.
    return [i.text.strip() for i in bs.find_all(tag, {"itemprop": attr_value})]


all_pages = []
for page in range(1, 29):
    print(f"Scraping data from page {page}...")

    current_page = f"https://www.birdsnest.com.au/womens/dresses?page={page}"
    source = requests.get(current_page, headers=headers, cookies=cookies)
    soup = BeautifulSoup(source.content, 'html.parser')

    brand = extract_info(soup, tag="strong", attr_value="brand")
    name = extract_info(soup, tag="h2", attr_value="name")
    price = extract_info(soup, tag="span", attr_value="price")

    all_pages.extend(
        [
            {
                "brand": b,
                "name": n,
                "price": p,
            } for b, n, p in zip(brand, name, price)
        ]
    )

print(f"{all_pages}\nFound: {len(all_pages)} dresses.")

with open("all_the_dresses2.json", "w") as jf:
    json.dump(all_pages, jf, indent=4)

This gets you a JSON file with all the dresses.

    {
        "brand": "boho bird",
        "name": "Prissy Dress",
        "price": "$189.95"
    },
    {
        "brand": "boho bird",
        "name": "Dandelion Dress",
        "price": "$139.95"
    },
    {
        "brand": "Lula Soul",
        "name": "Dandelion Dress",
        "price": "$179.95"
    },
    {
        "brand": "Honeysuckle Beach",
        "name": "Cotton V-Neck A-Line Splice Dress",
        "price": "$149.95"
    },
    {
        "brand": "Honeysuckle Beach",
        "name": "Lenny Pinafore",
        "price": "$139.95"
    },
and so on for the remaining pages ...
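If you want to work with the scraped data later, you can simply load the file back (assuming the same filename as above):

import json

# Read the dumped JSON back into a list of dicts.
with open("all_the_dresses2.json") as jf:
    dresses = json.load(jf)

print(f"Loaded {len(dresses)} dresses, e.g. {dresses[0]}")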