使用 bs4 从一个元素中逐个取出其子元素

时间:2018-05-20 23:14:13

标签: python web-scraping beautifulsoup

网站数据的结构如下:

tile

- > a

- > div

- > div

------> content

-----------> p

-----------> p

-----------> p

- > div

- > div

def grabData(url):
    """Print the text of the bordered content section of every shop tile.

    Downloads ``url``, parses it with BeautifulSoup's ``html.parser`` and,
    for each ``div`` carrying the class ``category-tile-grid-item``, prints
    the text of the first nested ``div`` carrying the class
    ``shop-tile__content--border-bottom``.

    Args:
        url: Address of the listing page to download.

    Returns:
        None. The extracted text is written to standard output.
    """
    from urllib.request import urlopen as uReq
    from bs4 import BeautifulSoup as soup

    # The context manager closes the connection even when read() raises.
    with uReq(url) as uClient:
        page_html = uClient.read()

    # html parsing
    page_soup = soup(page_html, "html.parser")

    # grabs each item
    tiles = page_soup.findAll('div', {'class': 'category-tile-grid-item'})

    for tile in tiles:
        content_container = tile.findAll(
            'div', {'class': 'shop-tile__content--border-bottom'}
        )
        if not content_container:
            # A tile without the content section would otherwise raise
            # IndexError on [0]; skip it and keep processing the rest.
            continue
        store_name = content_container[0].text
        print(store_name)

if __name__ == '__main__':
    # Shop listing page with the sale_price and shopify_shop_created_at
    # query parameters set.
    listing_url = 'https://exchange.shopify.com/shops?sale_price=1000&shopify_shop_created_at=6m%2C0m'
    grabData(listing_url)

我希望单独检索段落值。我怎么能这样做呢?

编辑:

我已根据 orhan solak 的回答更新了我的代码:

import requests
from scrapy.selector import Selector
import pandas as pd

# Listing pages to scrape, in order. The filtered URL is listed twice, so
# that page is requested twice.
start_urls = [
    "https://exchange.shopify.com/shops",
    "https://exchange.shopify.com/shops?sale_price=1000&shopify_shop_created_at=6m%2C0m",
    "https://exchange.shopify.com/shops?sale_price=1000&shopify_shop_created_at=6m%2C0m",
]

def shopify_exchange_data(url):
    """Scrape shop tiles from listing pages into ``test.csv`` and print them.

    Each page is downloaded with ``requests`` and queried with a Scrapy
    ``Selector``. Tile positions 1 through 24 are read on every page; each
    position contributes one row with the columns ``title``, ``url``,
    ``category``, ``traffic``, ``revenue`` and ``price``.

    Args:
        url: A single page URL (``str``) or an iterable of page URLs.

    Returns:
        None. The rows are written to ``test.csv`` and the DataFrame is
        printed to standard output.
    """
    # The parameter used to be ignored in favour of the global start_urls;
    # honour it, and accept a bare string as a one-element list.
    links = [url] if isinstance(url, str) else url

    # Class attribute values, pre-quoted so they can be spliced directly
    # into XPath predicates.
    grid_class = "'grid grid--equal-height'"
    tile_class = "'grid__item grid__item--tablet-up-third grid__item--desktop-up-quarter grid__item--wide-up-quarter gutter-bottom category-tile-grid-item layout-flex'"
    price_class = "'shop-tile__price'"
    url_class = "'shop-tile__url heading--truncated'"
    title_class = "'shop-tile__title heading--truncated'"
    category_class = "'shop-tile__category heading--truncated'"
    graph_class = "'shop-tile__data shop-tile__data--has-graph'"
    metrics_class = "'shop-tile__content shop-tile__metrics-container'"
    metric_row_class = "'shop-tile__metric'"
    metric_value_class = "'shop-tile__metric__value text-bold'"

    url_list = []
    category_list = []
    title_list = []
    value_list = []
    traffic_list = []
    revenue_list = []

    for link in links:
        # Getting the webpage, creating a Response object.
        response = requests.get(link, proxies=None)
        # Parse the page source once per page instead of once per field.
        page = Selector(text=response.text)

        def text_at(xpath, page=page):
            """Return the whitespace-normalised text selected by ``xpath``."""
            return page.xpath("normalize-space(" + xpath + ")").extract()[0]

        for i in range(1, 25):
            # Common prefix selecting the i-th tile of the grid.
            tile = (
                "//div[@class=" + grid_class + "]"
                "//div[@class=" + tile_class + "][" + str(i) + "]"
            )

            url_list.append(
                'http://' + text_at(tile + "//p[@class=" + url_class + "]")
            )
            title_list.append(text_at(tile + "//p[@class=" + title_class + "]"))
            category_list.append(
                text_at(tile + "//p[@class=" + category_class + "]")
            )

            # Traffic comes from the graph block's span; the first character
            # is dropped exactly as before.
            traffic = text_at(tile + "//div[@class=" + graph_class + "]/span")
            traffic_list.append(traffic[1:])

            # Revenue used to reuse the traffic XPath, so both columns held
            # the same value. Read the first metric row of the metrics
            # container instead; [1:] drops the leading character (the '$'
            # in values such as '$56').
            revenue = text_at(
                tile
                + "//div[@class=" + metrics_class + "]"
                + "//div[@class=" + metric_row_class + "][1]"
                + "//span[@class=" + metric_value_class + "][1]"
            )
            revenue_list.append(revenue[1:])

            value_list.append(text_at(tile + "//div[@class=" + price_class + "]"))

    df = pd.DataFrame()
    df['title'] = title_list
    df['url'] = url_list
    df['category'] = category_list
    df['traffic'] = traffic_list
    df['revenue'] = revenue_list
    df['price'] = value_list
    df.to_csv('test.csv', sep=",")

    print(df)

if __name__ == "__main__":
    # Run the scraper over every page listed in start_urls.
    shopify_exchange_data(start_urls)

现在我想弄清楚如何从指标容器(metrics container)的各项数值中取出 Revenue(收入)。我该怎么做?为什么我的代码取到的是流量而不是收入?

2 个答案:

答案 0 :(得分:3)

我建议您使用 Scrapy 的 Selector,通过具体的 XPath 来提取数据。我检查了你的代码,其中的 XPath 有些问题,所以我重新整理了代码:先把各个 class 的名称赋给变量,再把它们拼接成一条精确的 XPath。下面的示例代码创建了 7 个列表,分别保存价格、名称、网站类型、收入、流量、利润和库存价值。

import requests
from scrapy.selector import Selector

start_urls = ["https://exchange.shopify.com/shops?sale_price=1000&shopify_shop_created_at=6m%2C0m"]

# One list per scraped column; position k of every list belongs to the same
# tile.
price_list = []
name_list = []
website_type_list = []

revenue_list = []
traffic_list = []
profit_list = []
inventory_value_list = []

# Class attribute values, pre-quoted so they can be spliced directly into
# XPath predicates. They do not depend on the page, so they are defined once
# instead of on every pass of the page loop.
first_class_name = "'grid grid--equal-height'"
second_class_name = "'grid__item grid__item--tablet-up-third grid__item--desktop-up-quarter grid__item--wide-up-quarter gutter-bottom category-tile-grid-item layout-flex'"
# The second class is iterated 24 times (tile positions 1..24).

third_class_name = "'shop-tile__price'"
# this is price xpath

fourth_class_name = "'shop-tile__url heading--truncated'"
# this is name xpath

fifth_class_name = "'shop-tile__content shop-tile__metrics-container'"
# this is table xpath (from Revenue (USD) to Inventory Value)

sixth_class_name = "'shop-tile__metric'"
# this is all four rows' xpath in the table, indexed 1..4 from revenue to
# inventory value

seventh_class_name = "'shop-tile__category heading--truncated'"
# this is website's type (automotive, sports etc.)

# Tail shared by the four metric XPaths: first value span of a metric row,
# plus the parenthesis closing normalize-space(.
metric_value_suffix = "//span[@class='shop-tile__metric__value text-bold'][1])"

for link in start_urls:
    # Getting the webpage, creating a Response object.
    response = requests.get(link, proxies=None)
    # Extracting the source code of the page.
    data = response.text
    # Parse the page once; every XPath below queries this same selector
    # instead of re-parsing the document for each field of each tile.
    page = Selector(text=data)

    for i in range(1, 25):
        # Prefix selecting the i-th tile, and the metric rows inside it.
        tile_xpath = "//div[@class="+first_class_name+"]//div[@class="+second_class_name+"]["+str(i)+"]"
        metric_xpath = tile_xpath+"//div[@class="+fifth_class_name+"]//div[@class="+sixth_class_name+"]"

        price_xpath = "normalize-space("+tile_xpath+"//div[@class="+third_class_name+"])"
        price_list.append(page.xpath(price_xpath).extract()[0])

        name_xpath = "normalize-space("+tile_xpath+"//p[@class="+fourth_class_name+"])"
        name_list.append(page.xpath(name_xpath).extract()[0])

        website_type_xpath = "normalize-space("+tile_xpath+"//p[@class="+seventh_class_name+"])"
        website_type_list.append(page.xpath(website_type_xpath).extract()[0])

        # Metric rows in page order: 1 revenue, 2 traffic, 3 profit,
        # 4 inventory value.
        revenue_xpath = "normalize-space("+metric_xpath+"[1]"+metric_value_suffix
        revenue_list.append(page.xpath(revenue_xpath).extract()[0])

        traffic_xpath = "normalize-space("+metric_xpath+"[2]"+metric_value_suffix
        traffic_list.append(page.xpath(traffic_xpath).extract()[0])

        profit_xpath = "normalize-space("+metric_xpath+"[3]"+metric_value_suffix
        profit_list.append(page.xpath(profit_xpath).extract()[0])

        inventory_value_xpath = "normalize-space("+metric_xpath+"[4]"+metric_value_suffix
        inventory_value_list.append(page.xpath(inventory_value_xpath).extract()[0])


# Report how many entries each column list holds, one count per line.
for column in (
    price_list,
    name_list,
    website_type_list,
    revenue_list,
    traffic_list,
    profit_list,
    inventory_value_list,
):
    print(len(column))

输出:

24
24
24
24
24
24
24

检查列表:

# Show the first five entries of each column list, one list per line.
for column in (
    price_list,
    name_list,
    website_type_list,
    revenue_list,
    traffic_list,
    profit_list,
    inventory_value_list,
):
    print(column[:5])

输出:

['$1,150USD', '$3,000USD', '$2,500USD', '$1,000USD', '$2,300USD']
['www.cosmicdetail.co.uk', 'prestige-timepiece.com', 'gomommyboutique.com', 'sunnysx.com', 'squishywishy.com']
['Automotive', 'Fashion and apparel', 'Toys and games', 'Fashion and apparel', 'Gifts and collectibles']
['$56', '$961', '$70', '$1.3K', '$403']
['111', '7.5K', '454', '2.8K', '2.6K']
['$50', '$1.0K', '$700', '$500', '$100']
['$1.8K', '', '', '', '']

检查收入的xpath(如果你想在控制台上试试):

revenue_xpath

输出:

"normalize-space(//div[@class='grid grid--equal-height']//div[@class='grid__item grid__item--tablet-up-third grid__item--desktop-up-quarter grid__item--wide-up-quarter gutter-bottom category-tile-grid-item layout-flex'][24]//div[@class='shop-tile__content shop-tile__metrics-container']//div[@class='shop-tile__metric'][1]//span[@class='shop-tile__metric__value text-bold'][1])"

答案 1 :(得分:1)

要单独获取该段落,您可以执行以下操作:

from urllib.request import urlopen
from bs4 import BeautifulSoup

# Listing page passed to grabData when the file is run as a script.
weblink = (
    'https://exchange.shopify.com/shops'
    '?sale_price=1000&shopify_shop_created_at=6m%2C0m'
)

def grabData(url):
    """Print the title, URL text and category of every shop tile on a page.

    Downloads ``url``, parses it with BeautifulSoup's ``html.parser`` and,
    for each element carrying the class ``category-tile-grid-item``, prints
    the text of its ``shop-tile__title``, ``shop-tile__url`` and
    ``shop-tile__category`` elements on three lines followed by a blank line.

    Args:
        url: Address of the listing page to download.

    Returns:
        None. The extracted text is written to standard output.
    """
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(url) as res:
        soup = BeautifulSoup(res, "html.parser")

    for items in soup.find_all(class_="category-tile-grid-item"):
        fields = []
        for css_class in ("shop-tile__title", "shop-tile__url", "shop-tile__category"):
            node = items.find(class_=css_class)
            # find() returns None when the element is absent; print an empty
            # line for that field instead of raising AttributeError.
            fields.append(node.text if node is not None else "")
        print("{}\n{}\n{}\n".format(*fields))

if __name__ == "__main__":
    # Scrape the page stored in weblink when run as a script.
    grabData(weblink)

输出:

DIODE BRAND
diodebrand.us
Art and photography

Private listing #572131
URL Hidden
Gifts and collectibles