Python: saving files with the same name in a folder

Asked: 2017-10-18 09:32:43

Tags: python json web-scraping duplicates overwrite

Code:

import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import json
import os


res = requests.get('http://www.abcde.com/frontend/SearchParts')
soup = BeautifulSoup(res.text, "lxml")
href = [a["href"] for a in soup.find_all("a", {"id": re.compile("parts_img.*")})]
b1 = ["http://www.abcde.com" + url for url in href]
# collect every page of each listing by following the rel="next" links
b = []
for start_url in b1:
    res2 = requests.get(start_url).text
    soup2 = BeautifulSoup(res2, "lxml")
    url_n = "http://www.abcde.com" + soup2.find(rel='next')['href']

    b.append(start_url)
    b.append(url_n)
    while True:
        res3 = requests.get(url_n).text
        soup3 = BeautifulSoup(res3, "lxml")
        try:
            url_n = soup3.find(rel='next')['href']
        except TypeError:
            # the last page has no rel="next" link
            break
        if url_n:
            url_n = "http://www.abcde.com" + url_n
            b.append(url_n)
# collect the individual article URLs from every listing page
# ("article_links", not "all", so the built-in is not shadowed)
article_links = []
for url in b:
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select(".article-title"):
        article_links.append(urljoin('http://www.abcde.com', item['href']))
for article_url in article_links:
    # "res", not "re", so the imported re module is not shadowed
    res = requests.get(article_url)
    soup = BeautifulSoup(res.text, "html.parser")
    title_tag = soup.select_one('.page_article_title')

    # "paragraphs", not "list", so the built-in is not shadowed
    paragraphs = [tag.text for tag in soup.select('.page_article_content')]
    paragraphs = [c.replace('\n', '').replace('\r', '').replace('\t', '').replace(u'\xa0', u' ')
                  for c in paragraphs]
    article = ', '.join(paragraphs)

    fruit_tag = soup.select_one('.authorlink')
    fruit_final = fruit_tag.text if fruit_tag else None

    keys = soup.find_all('div', style="font-size:1.2em;")
    if keys:
        keywords = [k.text.replace('\n', '').replace(' ', '') for k in keys]
        key_final = ', '.join(keywords)
    else:
        key_final = None

    ################## edit part ####################################
    # indented into the loop so that every article gets saved,
    # not just the last one
    data = {
        "Title": title_tag.text,
        "Registration": fruit_final,
        "Keywords": key_final,
        "Article": article
    }

    save_path = "C:/json/"
    # pick the file name *before* opening the file: opening with 'w'
    # first would truncate an existing file of the same name, and
    # listdir() returns bare names, so compare against those rather
    # than against the full path
    base_name = '%s.json' % title_tag.text
    if base_name in os.listdir(save_path):
        base_name = '%s_1.json' % title_tag.text
    file_name = save_path + base_name
    with open(file_name, 'w', encoding='UTF-8') as f:
        f.write(json.dumps(data, ensure_ascii=False))

I scraped a site and pulled out each article's title as title_tag.text. Some articles turn out to have the same title but different URLs/content, so I still need to save all of them in my directory. I know that when two titles collide I can save one as original and the other as original_1, but what if four files end up with the same title? How can I handle that case? Thanks in advance!
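One common way to generalize the _1 suffix is to keep incrementing a counter until a free name is found. A minimal sketch of that idea, assuming the data dict and title_tag from the code above (the helper name unique_path is made up for illustration; os.path.exists does the existence check):

import os

def unique_path(directory, title):
    # try "<title>.json", then "<title>_1.json", "<title>_2.json", ...
    # and return the first path that does not exist on disk yet
    candidate = os.path.join(directory, '%s.json' % title)
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(directory, '%s_%d.json' % (title, counter))
        counter += 1
    return candidate

file_name = unique_path("C:/json", title_tag.text)
with open(file_name, 'w', encoding='UTF-8') as f:
    f.write(json.dumps(data, ensure_ascii=False))

This handles any number of colliding titles, at the cost of one existence check per duplicate already on disk; note that titles containing characters Windows forbids in file names would still need separate sanitizing.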

0 Answers:

No answers