from collections import defaultdict
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
def _text_or_blank(node):
    """Return the stripped text of *node*, or "" when *node* is None.

    ``find`` returns None when nothing matches, so an explicit check replaces
    the bare ``try/except`` wrappers, which also hid unrelated errors such as
    misspelled variable names.
    """
    if node is None:
        return ""
    return node.text.strip()


def _second_or_none(nodes):
    """Return the second element of *nodes*, or None when there are fewer."""
    return nodes[1] if len(nodes) > 1 else None


# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(
    "http://www.walmart.com/search/?query=marvel&cat_id=4096_530598",
    timeout=30,
)
# Name the parser explicitly so the result does not depend on which parser
# libraries happen to be installed on the machine.
soup = BeautifulSoup(r.content, "html.parser")
g_data = soup.find_all("div", {"class": "tile-content"})

# Every column must receive exactly one value per tile; otherwise the lists
# have different lengths and the rows of the resulting table are misaligned.
data = defaultdict(list)
for tile in g_data:
    # Product title
    data['Product Title'].append(
        _text_or_blank(tile.find("a", "js-product-title")))

    # Price
    data['Price'].append(
        _text_or_blank(tile.find('span', 'price price-display')))

    # Star rating: the readable label is a hidden span inside the stars box.
    stars_box = tile.find("div", {"class": "stars stars-small tile-row"})
    stars_label = None
    if stars_box is not None:
        stars_label = stars_box.find('span', 'visuallyhidden')
    data['Stars'].append(_text_or_blank(stars_label))

    # Starring
    data['Starring'].append(_text_or_blank(tile.find(
        'dd',
        {"class": "media-details-multi-line media-details-artist-dd module"})))

    # Running time: second <dd> of the first details list that has one.
    # Only the first hit is kept so the column gets one value per tile.
    running_time = None
    for details in tile.find_all(
            'dl', {"class": "media-details dl-horizontal copy-mini"}):
        running_time = _second_or_none(details.find_all('dd'))
        if running_time is not None:
            break
    data['Running Time'].append(_text_or_blank(running_time))

    # Format: second multi-line <dd> of the tile, blank when absent.
    dd_format = tile.find_all('dd', {"class": "media-details-multi-line"})
    data['Format'].append(_text_or_blank(_second_or_none(dd_format)))

    # Shipping: find() returns None instead of raising, so test the result.
    # NOTE(review): the original try/except implies "marker absent => Free
    # Shipping"; confirm that this is the intended meaning of the attribute.
    not_eligible = tile.find(
        'div', {"data-offer-shipping-pass-eligible": "false"})
    data['Shipping'].append("" if not_eligible is not None else "Free Shipping")

df = pd.DataFrame(data)
df
如果某个 <dd> 元素没有类名,我该如何访问它?
例如第11行中有5个 <dd>。
目前我使用 [1:2] 之类的切片来访问它,但这种方式不够灵活,也无法正确填充我的表格。
有没有什么函数可以做到这一点?
答案 0 :(得分:1)
将 Starring(主演)和 Running Time(运行时间)这两部分替换为以下代码:
# Replacement for the "Starring" and "Running Time" sections of the per-tile
# loop. Relies on `tile` (the current product tile) and `data` (the
# column -> list mapping) from the enclosing loop.
try:
    # Match on the single distinguishing class rather than the full
    # multi-class string.
    dd_starring = tile.find('dd', {"class": "media-details-artist-dd"}).text.strip()
    data['Starring'].append(dd_starring)
except AttributeError:
    # find() returned None: this tile has no "Starring" entry.
    data['Starring'].append("")
try:
    # Locate the running-time label (<dt>) and read the <dd> that follows it,
    # instead of relying on the position of the <dd> within the list.
    running = tile.find('dt', {'class': 'media-details-running-time'})
    running_time = running.find_next("dd")
    data['Running Time'].append(running_time.text.strip())
except AttributeError:
    # Either the label or the following <dd> is missing for this tile.
    data['Running Time'].append("")
现在应该可以运行了。看起来用 BeautifulSoup 同时按多个类名选择时可能会匹配失败,所以可以只通过 CSS 类 media-details-artist-dd 来获取演员(Starring)。对于运行时间,我用了一个简单的技巧 :)
编辑:修改了代码,先找到运行时间的 dt,再获取其后的下一个 dd。之前的代码有一段多余的部分,现在应该可以正常使用了。