网站数据的结构如下:
tile
-> a
-> grid
-> grid
------> content
-----------> p
-----------> p
-----------> p
-> grid
-> grid
def grabData(url):
    """Download a Shopify Exchange listing page and print each shop's name.

    Parameters
    ----------
    url : str
        Address of the listing page to scrape.
    """
    # Function-scope imports keep the snippet self-contained; the unused
    # pandas/numpy imports from the original were removed, as was the
    # redundant ``my_url = url`` alias.
    from contextlib import closing
    from urllib.request import urlopen as uReq
    from bs4 import BeautifulSoup as soup

    # closing() releases the connection even if read() raises.
    with closing(uReq(url)) as uClient:
        page_html = uClient.read()

    # html parsing
    page_soup = soup(page_html, "html.parser")

    # grabs each item: every listing is one 'category-tile-grid-item' tile
    tiles = page_soup.findAll('div', {'class': 'category-tile-grid-item'})
    for tile in tiles:
        content_container = tile.findAll('div', {'class': 'shop-tile__content--border-bottom'})
        # Skip malformed tiles instead of raising IndexError.
        if content_container:
            store_name = content_container[0].text
            print(store_name)
if __name__ == '__main__':
    # Scrape the filtered Shopify Exchange listing (sale price >= $1000,
    # shop created within the last 6 months).
    listing_url = 'https://exchange.shopify.com/shops?sale_price=1000&shopify_shop_created_at=6m%2C0m'
    grabData(listing_url)
我希望单独检索段落值。我怎么能这样做呢?
编辑:
我已根据 orhan solak 的回答更新了我的代码:
import requests
from scrapy.selector import Selector
import pandas as pd
# Listing pages to scrape (24 shop tiles per page).
# NOTE(review): the second and third entries are identical, so that page is
# fetched twice and its rows appear twice in the output — confirm intended.
start_urls = ["https://exchange.shopify.com/shops", "https://exchange.shopify.com/shops?sale_price=1000&shopify_shop_created_at=6m%2C0m", "https://exchange.shopify.com/shops?sale_price=1000&shopify_shop_created_at=6m%2C0m"]
def shopify_exchange_data(url):
    """Scrape Shopify Exchange listing pages and save the shops to test.csv.

    Parameters
    ----------
    url : str or list of str
        One listing-page address, or a list of them, to scrape.
        (The original ignored this parameter and read the global
        ``start_urls`` instead.)

    Side effects: writes 'test.csv' and prints the resulting DataFrame.
    """
    # Accept both a single URL and a list of URLs.
    links = [url] if isinstance(url, str) else url

    url_list = []
    category_list = []
    title_list = []
    value_list = []
    traffic_list = []
    revenue_list = []

    for link in links:
        # Getting the webpage, creating a Response object.
        response = requests.get(link, proxies=None)
        # Extracting the source code of the page.
        data = response.text
        # Build the Selector once per page instead of once per XPath query.
        sel = Selector(text=data)

        # Class names are pre-quoted so they can be dropped straight into
        # the XPath predicates below.
        first_class_name = "'grid grid--equal-height'"
        second_class_name = "'grid__item grid__item--tablet-up-third grid__item--desktop-up-quarter grid__item--wide-up-quarter gutter-bottom category-tile-grid-item layout-flex'"
        third_class_name = "'shop-tile__price'"  # asking price
        fourth_class_name = "'shop-tile__url heading--truncated'"  # shop URL
        title_class_name = "'shop-tile__title heading--truncated'"
        category_class_name = "'shop-tile__category heading--truncated'"
        # Metrics table: one container per tile, one 'shop-tile__metric'
        # row per figure (row 1 = revenue, row 2 = traffic).
        metrics_class_name = "'shop-tile__content shop-tile__metrics-container'"
        metric_row_class_name = "'shop-tile__metric'"

        # Each listing page shows 24 shop tiles.
        for i in range(1, 25):
            tile = ("//div[@class=" + first_class_name + "]"
                    "//div[@class=" + second_class_name + "][" + str(i) + "]")

            test_url = sel.xpath("normalize-space(" + tile + "//p[@class=" + fourth_class_name + "])").extract()[0]
            url_list.append('http://' + test_url)

            title_list.append(sel.xpath("normalize-space(" + tile + "//p[@class=" + title_class_name + "])").extract()[0])
            category_list.append(sel.xpath("normalize-space(" + tile + "//p[@class=" + category_class_name + "])").extract()[0])

            # BUG FIX: revenue_xpath and traffic_xpath were previously the
            # *same* expression (first span under shop-tile__data--has-graph),
            # so both columns held the same figure.  Index into the metric
            # rows instead: [1] is Revenue, [2] is Traffic.
            metric = (tile + "//div[@class=" + metrics_class_name + "]"
                      "//div[@class=" + metric_row_class_name + "][{}]"
                      "//span[@class='shop-tile__metric__value text-bold'][1]")

            test_revenue = sel.xpath("normalize-space(" + metric.format(1) + ")").extract()[0]
            revenue_list.append(test_revenue.lstrip('$'))  # '$1.3K' -> '1.3K'

            test_traffic = sel.xpath("normalize-space(" + metric.format(2) + ")").extract()[0]
            traffic_list.append(test_traffic)  # visitor counts carry no '$'

            value_list.append(sel.xpath("normalize-space(" + tile + "//div[@class=" + third_class_name + "])").extract()[0])

    df = pd.DataFrame()
    df['title'] = title_list
    df['url'] = url_list
    df['category'] = category_list
    df['traffic'] = traffic_list
    df['revenue'] = revenue_list
    df['price'] = value_list
    df.to_csv('test.csv', sep=",")
    print(df)
if __name__ == '__main__':
    # Scrape every configured listing page into test.csv.
    shopify_exchange_data(start_urls)
我现在想弄清楚如何获取指标容器(metrics container)中的 Revenue 以及其他数值。我该怎么做呢?另外,为什么我的代码取到的是流量(traffic)而不是收入(revenue)?
答案 0 :(得分:3)
我建议您使用Scrapy来提取具有特定Xpath的数据。我检查了你的代码,Xpath有点不对劲。我已经重新配置了你的代码。我将类的名称分配给变量。之后我将它们组合起来创建一个精确的xpath 。在下面的代码示例中;我创建了7个列表,分别包含价格,名称,网站类型,收入,流量,利润,广告资源价值。
import requests
from scrapy.selector import Selector

# Listing page to scrape; each page renders 24 shop tiles.
start_urls = ["https://exchange.shopify.com/shops?sale_price=1000&shopify_shop_created_at=6m%2C0m"]

price_list = []
name_list = []
website_type_list = []
revenue_list = []
traffic_list = []
profit_list = []
inventory_value_list = []

for link in start_urls:
    # Fetch the page and keep its raw HTML source.
    response = requests.get(link, proxies=None)
    data = response.text
    sel = Selector(text=data)  # build the selector once per page

    # Class names are pre-quoted so they slot straight into XPath predicates.
    first_class_name = "'grid grid--equal-height'"
    second_class_name = "'grid__item grid__item--tablet-up-third grid__item--desktop-up-quarter grid__item--wide-up-quarter gutter-bottom category-tile-grid-item layout-flex'"  # one per tile; iterated 24 times
    third_class_name = "'shop-tile__price'"  # asking price
    fourth_class_name = "'shop-tile__url heading--truncated'"  # shop name/url
    fifth_class_name = "'shop-tile__content shop-tile__metrics-container'"  # metrics table (Revenue .. Inventory Value)
    sixth_class_name = "'shop-tile__metric'"  # one row per metric; indexed 1..4 below
    seventh_class_name = "'shop-tile__category heading--truncated'"  # website type (automotive, sports etc.)

    for i in range(1, 25):
        tile = f"//div[@class={first_class_name}]//div[@class={second_class_name}][{i}]"

        price_list.append(sel.xpath(f"normalize-space({tile}//div[@class={third_class_name}])").extract()[0])
        name_list.append(sel.xpath(f"normalize-space({tile}//p[@class={fourth_class_name}])").extract()[0])
        website_type_list.append(sel.xpath(f"normalize-space({tile}//p[@class={seventh_class_name}])").extract()[0])

        # Metric rows 1..4 map to Revenue, Traffic, Profit, Inventory Value.
        for target, row in ((revenue_list, 1), (traffic_list, 2),
                            (profit_list, 3), (inventory_value_list, 4)):
            metric_xpath = (f"normalize-space({tile}//div[@class={fifth_class_name}]"
                            f"//div[@class={sixth_class_name}][{row}]"
                            f"//span[@class='shop-tile__metric__value text-bold'][1])")
            target.append(sel.xpath(metric_xpath).extract()[0])

print(len(price_list))
print(len(name_list))
print(len(website_type_list))
print(len(revenue_list))
print(len(traffic_list))
print(len(profit_list))
print(len(inventory_value_list))
输出:
24
24
24
24
24
24
24
检查列表:
print(price_list[:5])
print(name_list[:5])
print(website_type_list[:5])
print(revenue_list[:5])
print(traffic_list[:5])
print(profit_list[:5])
print(inventory_value_list[:5])
输出:
['$1,150USD', '$3,000USD', '$2,500USD', '$1,000USD', '$2,300USD']
['www.cosmicdetail.co.uk', 'prestige-timepiece.com', 'gomommyboutique.com', 'sunnysx.com', 'squishywishy.com']
['Automotive', 'Fashion and apparel', 'Toys and games', 'Fashion and apparel', 'Gifts and collectibles']
['$56', '$961', '$70', '$1.3K', '$403']
['111', '7.5K', '454', '2.8K', '2.6K']
['$50', '$1.0K', '$700', '$500', '$100']
['$1.8K', '', '', '', '']
检查收入的xpath(如果你想在控制台上试试):
revenue_xpath
输出:
"normalize-space(//div[@class='grid grid--equal-height']//div[@class='grid__item grid__item--tablet-up-third grid__item--desktop-up-quarter grid__item--wide-up-quarter gutter-bottom category-tile-grid-item layout-flex'][24]//div[@class='shop-tile__content shop-tile__metrics-container']//div[@class='shop-tile__metric'][1]//span[@class='shop-tile__metric__value text-bold'][1])"
答案 1 :(得分:1)
要单独获取该段落,您可以执行以下操作:
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Filtered Shopify Exchange listing page to scrape.
weblink = 'https://exchange.shopify.com/shops?sale_price=1000&shopify_shop_created_at=6m%2C0m'
def grabData(url):
    """Fetch a Shopify Exchange listing page and, for every shop tile,
    print its title, URL and category on separate lines."""
    markup = BeautifulSoup(urlopen(url), "html.parser")
    for tile in markup.find_all(class_="category-tile-grid-item"):
        # Pull each paragraph value out of the tile individually.
        shop_title = tile.find(class_="shop-tile__title").text
        shop_url = tile.find(class_="shop-tile__url").text
        shop_category = tile.find(class_="shop-tile__category").text
        print(f"{shop_title}\n{shop_url}\n{shop_category}\n")
if __name__ == '__main__':
    # Scrape the predefined listing page.
    grabData(weblink)
输出:
DIODE BRAND
diodebrand.us
Art and photography
Private listing #572131
URL Hidden
Gifts and collectibles