代码:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import json
from os import listdir
BASE_URL = 'http://www.abcde.com'

# Fetch the search page and collect the href of every anchor whose id
# starts with "parts_img" (one link per part category).
res = requests.get(BASE_URL + '/frontend/SearchParts')
soup = BeautifulSoup(res.text, "lxml")
href = [a["href"] for a in soup.findAll("a", {"id": re.compile("parts_img.*")})]

# b1: absolute URL of the first listing page of each category.
b1 = [urljoin(BASE_URL, h) for h in href]

# b: every listing page — each category's first page plus all of its
# rel="next" pagination pages, in crawl order.
b = []
for start_url in b1:
    b.append(start_url)
    page = BeautifulSoup(requests.get(start_url).text, "lxml")
    while True:
        # The pagination link is the tag carrying rel="next".  When it is
        # absent, find() returns None and the ['href'] lookup raises
        # TypeError — that is our "last page" signal.
        try:
            next_href = page.find('', rel='next')['href']
        except TypeError:
            break
        next_url = urljoin(BASE_URL, next_href)
        b.append(next_url)
        page = BeautifulSoup(requests.get(next_url).text, "lxml")
# Collect the URL of every article listed on the pages gathered in `b`.
# (Renamed from `all`, which shadowed the builtin.)
article_urls = []
for url in b:
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select(".article-title"):
        article_urls.append(urljoin('http://www.abcde.com', item['href']))

save_path = "C:/json/"

for article_url in article_urls:
    # Do not rebind `re` here — that shadowed the regex module.
    res = requests.get(article_url)
    # Pass the decoded text straight to BeautifulSoup; re-encoding to bytes
    # only forces bs4 to guess the charset again.
    soup = BeautifulSoup(res.text, "html.parser")
    title_tag = soup.select_one('.page_article_title')

    # Article body: concatenate every content block, stripping layout
    # whitespace (newlines, tabs, non-breaking spaces).
    paragraphs = [tag.text for tag in soup.select('.page_article_content')]
    cleaned = []
    for text in paragraphs:
        for old, new in (('\n', ''), ('\r', ''), ('\t', ''), (u'\xa0', u' ')):
            text = text.replace(old, new)
        cleaned.append(text)
    article_text = ', '.join(cleaned)

    # Registration: text of .authorlink if present, else None.
    fruit_tag = soup.select_one('.authorlink')
    registration = fruit_tag.text if fruit_tag else None

    # Keywords: the styled divs, whitespace-stripped and comma-joined;
    # None when the page has no such divs.
    key_divs = soup.findAll('div', style="font-size:1.2em;")
    if key_divs:
        keywords = ', '.join(
            k.text.replace('\n', '').replace(' ', '') for k in key_divs)
    else:
        keywords = None

    title = title_tag.text
    data = {
        "Title": title,
        "Registration": registration,
        "Keywords": keywords,
        "Article": article_text,
    }

    # Pick a filename that does not collide with anything already saved:
    # title.json, then title_1.json, title_2.json, ... for any number of
    # articles sharing the same title.  The old code compared a full path
    # against listdir() basenames (never matched) and opened the file
    # before checking, truncating the existing one.
    # NOTE(review): `title` may contain characters invalid in Windows
    # filenames — consider sanitizing it before use.
    existing = set(listdir(save_path))
    file_name = '%s.json' % title
    counter = 1
    while file_name in existing:
        file_name = '%s_%d.json' % (title, counter)
        counter += 1

    with open(save_path + file_name, 'w', encoding='UTF-8') as f:
        json.dump(data, f, ensure_ascii=False)
我抓了一个网页并将每篇文章的标题提取为title_tag.text。我发现有些文章的标题相同但网址/内容不同,所以我仍然需要将它们保存在我的目录中。现在我知道如果检查两个标题是否相同,我可以将其中一个命名为原始,另一个命名为original_1。但是,如果我需要保存4个具有相同标题的文件呢?在这种情况下如何做到这一点?提前谢谢!