I have some code that scrapes all the content from a particular webpage. I now want to build something that gives me the specific details: for example, if I enter a style ID it should give me the related details for it, or if I enter a category it should give me all the items in that category along with their details. My code is:-
import requests, re
from bs4 import BeautifulSoup

url = "http://www.barneys.com/theory-andrejs-sweater-503900006.html#start=2"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")

links = soup.find_all("a")
img = soup.find(itemprop="image")

# Category: taken from the breadcrumb links
g_d4 = soup.find_all("ol", {"class": "breadcrumb"})
for item in g_d4:
    links_2 = soup.find_all('a', href=re.compile('^http://www.barneys.com/barneys-new-york/men/'))
    pattern_2 = re.compile(r"clothing/(\w+)")
    for link in links_2:
        match_1 = pattern_2.search(link["href"])
        if match_1:
            print("Category:- " + match_1.group(1))
            break

# Product details: brand, type, description, style ID, recommendations, prices
g_d1 = soup.find_all("div", {"id": "product-content"})
for item in g_d1:
    try:
        print("\n\nBRAND:-" + item.contents[1].text)
    except:
        pass
    try:
        a_1 = item.find("ol", {"class": "breadcrumb"})
        a_2 = a_1.text
        print(a_2)
    except:
        pass
    try:
        print("TYPE:-" + item.find("h1", {"class": "product-name", "itemprop": "name"}).text + ';')
    except:
        pass
    try:
        d2 = item.find("div", {"class": "panel-body standard-p"})
        d3 = d2.text
        p_id = re.findall(r'[0-9]{9}', d3)
        id_2 = p_id[0]
        # Recommended products come from the richrelevance endpoint for this style ID
        url_1 = 'http://recs.richrelevance.com/rrserver/p13n_generated.js?a=dbeab3c977a08905&ts=1434386243747&p=' + str(id_2) + '&pt=%7Citem_page.rr1%7Citem_page.featured_item_0%7Citem_page.featured_item_1%7Citem_page.featured_item_2%7Citem_page.featured_item_3&u=mVBBR9wkG1PJ7zehLfmNXwzRp4WGMeDLG4M%3D&s=mVBBR9wkG1PJ7zehLfmNXwzRp4WGMeDLG4M%3D&cts=http%3A%2F%2Fwww.barneys.com&chi=%7Cmens-shirts-dress-classic&flv=18.0.0&rcs=eF4NyjEOgCAMBdCFybs0obQfyg28BhRIHNzU88v68sJxf881TDUSq6hYTimWomRGm9gkh9fPZo21olN3qbT3ogUYOcATzpgRP7a2EmY&l=1'
        r_1 = requests.get(url_1)
        pattern = re.compile(r'(?<=p=)[0-9]+(?=&)')
        product_ids = pattern.findall(str(r_1.content))
        print("DETAILS:- " + d3 + ';')
        print("\nStyle ID:- " + id_2 + ';')
        print("\nRecommended Product ID's:- ")
        print(','.join(i for i in product_ids))
    except:
        pass
    try:
        print("\nURL:-" + img["src"] + ';')
    except:
        pass
    try:
        print("\nFull Price:-" + item.find("span", {"class": "price-standard"}).text + ';')
    except:
        pass
    try:
        print("\nDiscounted Price:-" + item.find("span", {"class": "price-sales"}).text + ';')
    except:
        pass

# Other available colours of the same product
g_d2 = soup.find_all("div", {"class": "color-scroll"})
pattern_1 = re.compile(r"pid=(\w+)")
for item in g_d2:
    links_1 = soup.find_all('a', href=re.compile('^/on/demandware.store/Sites-BNY-Site/default/Product-Variation'))
    for link in links_1[1:]:
        match = pattern_1.search(link["href"])
        if match:
            print("\nProduct ID of other color:-")
            print(match.group(1))
Answer 0: (score: 0)
I added a dictionary called d and stored the scraped values in it:
import requests, re
from bs4 import BeautifulSoup

d = {}

url = "http://www.barneys.com/theory-andrejs-sweater-503900006.html#start=2"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")

links = soup.find_all("a")
d["links"] = []
d["links"].append(("href", [link.get("href") for link in links]))
d["links"].append(("class", [link.get("class") for link in links]))

img = soup.find(itemprop="image")
d["img"] = []
d["img"].append([("alt", img.get("alt")), ("src", img.get("src")), ("itemprop", img.get("itemprop")), ("class", img.get("class")[0])])  # Note: this is a list of tuples, so index numerically, e.g. d["img"][0], instead of d["img"]["alt"]

g_d4 = soup.find_all("ol", {"class": "breadcrumb"})
for item in g_d4:
    links_2 = soup.find_all('a', href=re.compile('^http://www.barneys.com/barneys-new-york/men/'))
    pattern_2 = re.compile(r"clothing/(\w+)")
    for link in links_2:
        match_1 = pattern_2.search(link["href"])
        if match_1:
            print("Category:- " + match_1.group(1))
            break

g_d1 = soup.find_all("div", {"id": "product-content"})
for item in g_d1:
    try:
        d["Brand"] = item.contents[1].text
        print("\n\nBRAND:-" + item.contents[1].text)
    except:
        pass
    try:
        a_1 = item.find("ol", {"class": "breadcrumb"})
        a_2 = a_1.text
        d["a_2"] = a_2
        print(a_2)
    except:
        pass
    try:
        print("TYPE:-" + item.find("h1", {"class": "product-name", "itemprop": "name"}).text + ';')
        d["Type"] = item.find("h1", {"class": "product-name", "itemprop": "name"}).text
    except:
        pass
    try:
        d2 = item.find("div", {"class": "panel-body standard-p"})
        d3 = d2.text
        p_id = re.findall(r'[0-9]{9}', d3)
        id_2 = p_id[0]
        url_1 = 'http://recs.richrelevance.com/rrserver/p13n_generated.js?a=dbeab3c977a08905&ts=1434386243747&p=' + str(id_2) + '&pt=%7Citem_page.rr1%7Citem_page.featured_item_0%7Citem_page.featured_item_1%7Citem_page.featured_item_2%7Citem_page.featured_item_3&u=mVBBR9wkG1PJ7zehLfmNXwzRp4WGMeDLG4M%3D&s=mVBBR9wkG1PJ7zehLfmNXwzRp4WGMeDLG4M%3D&cts=http%3A%2F%2Fwww.barneys.com&chi=%7Cmens-shirts-dress-classic&flv=18.0.0&rcs=eF4NyjEOgCAMBdCFybs0obQfyg28BhRIHNzU88v68sJxf881TDUSq6hYTimWomRGm9gkh9fPZo21olN3qbT3ogUYOcATzpgRP7a2EmY&l=1'
        r_1 = requests.get(url_1)
        pattern = re.compile(r'(?<=p=)[0-9]+(?=&)')
        product_ids = pattern.findall(str(r_1.content))
        print("DETAILS:- " + d3 + ';')
        d["Details"] = d3.split(",")
        print("\nStyle ID:- " + id_2 + ';')
        d["Style"] = ("ID", id_2)
        print("\nRecommended Product ID's:- ")
        print(','.join(i for i in product_ids))
        d["RecommendedProductIDs"] = [i for i in product_ids]
    except:
        pass
    try:
        print("\nURL:-" + img["src"] + ';')
    except:
        pass
    try:
        print("\nFull Price:-" + item.find("span", {"class": "price-standard"}).text + ';')
    except:
        pass
    try:
        print("\nDiscounted Price:-" + item.find("span", {"class": "price-sales"}).text + ';')
    except:
        pass

g_d2 = soup.find_all("div", {"class": "color-scroll"})
pattern_1 = re.compile(r"pid=(\w+)")
for item in g_d2:
    links_1 = soup.find_all('a', href=re.compile('^/on/demandware.store/Sites-BNY-Site/default/Product-Variation'))
    for link in links_1[1:]:
        match = pattern_1.search(link["href"])
        if match:
            print("\nProduct ID of other color:-")
            print(match.group(1))
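
To get the lookup you described (style ID -> details, category -> all items), you would run the scraper once per product page and index each resulting d. Here is a minimal sketch of that idea; the names index_by_style, index_by_category, index_product, details_for_style and items_in_category are hypothetical helpers, not part of the code above, and the example style ID and category are just placeholders taken from the product URL.

# Minimal sketch: lookup tables built from the per-product dictionaries.
# Assumes each product page has been scraped into its own dict d as above.
index_by_style = {}     # style ID -> product dict
index_by_category = {}  # category name -> list of product dicts

def index_product(d, category):
    """Store one scraped product dict under its style ID and its category."""
    style_id = d.get("Style", (None, None))[1]   # d["Style"] is ("ID", id_2)
    if style_id:
        index_by_style[style_id] = d
    index_by_category.setdefault(category, []).append(d)

def details_for_style(style_id):
    """Return the stored details for a given style ID, or None if unknown."""
    return index_by_style.get(style_id)

def items_in_category(category):
    """Return every stored product dict for a given category."""
    return index_by_category.get(category, [])

# Example usage, assuming d holds the sweater scraped above and the breadcrumb
# printed "Category:- sweaters" (placeholder values):
# index_product(d, "sweaters")
# print(details_for_style("503900006"))
# for product in items_in_category("sweaters"):
#     print(product.get("Type"), product.get("Details"))

This keeps the scraping and the querying separate: the loop over product pages only fills the two indexes, and the two lookup helpers answer the "give me details for this style ID" and "give me all items in this category" questions without re-fetching anything.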