I want to remove duplicate URLs from a file containing a list of URLs. My bugun_url_given.txt contains "http://www.bugun.com.tr/ara/Ak%20Parti/1". The script fetches all the URLs, but it writes duplicates... It should save only the unique URLs in "bugun_url_collection.txt". Here is my code:
from cookielib import CookieJar
import urllib2
import json
from bs4 import BeautifulSoup

cj = CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

try:
    text_file = open('bugun_url_given.txt', 'r')
    for line in text_file:
        print line
        soup = BeautifulSoup(opener.open(line))
        links = soup.select('div.nwslist a')
        for link in links:
            print link
            #unique_url = set(map(lambda url : url.strip("/ "), links))
            with open('bugun_url_collection.txt', 'a') as f:
                for link in links:
                    f.write(link.get('href') + '\n')
except ValueError:
    pass
Answer 0 (score: 2)
for link in links:
    f.write(link.get('href') + '\n')
can become
for link in set(link.get('href') for link in links):
    f.write(link + '\n')
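To see what the set() wrapper buys you, here is a toy run with stand-in data (the dicts below are hypothetical; they just expose the same .get('href') call a BeautifulSoup tag does):

links = [{'href': 'http://a'}, {'href': 'http://b'}, {'href': 'http://a'}]
print set(link.get('href') for link in links)
# prints set(['http://a', 'http://b']) -- the duplicate is gone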
In response to the comment (which is correct), let's rewrite this properly:
from cookielib import CookieJar
import urllib2
import json
from bs4 import BeautifulSoup

cj = CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

def write_links_to_file(links):
    with open('bugun_url_collection.txt', 'a') as f:
        for link in links:
            f.write(link + '\n')

def get_links_from_file(text_file):
    for line in text_file:
        print line
        soup = BeautifulSoup(opener.open(line))
        links = soup.select('div.nwslist a')
        for link in links:
            yield link.get('href')

with open('bugun_url_given.txt', 'r') as text_file:
    links = get_links_from_file(text_file)
    unique_links = set(links)
    write_links_to_file(unique_links)
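A quick sanity check afterwards (a sketch, assuming bugun_url_collection.txt has already been written): the two counts below match exactly when the file holds no duplicates.

with open('bugun_url_collection.txt') as f:
    lines = [line.strip() for line in f if line.strip()]
print len(lines), len(set(lines))  # equal counts means no duplicates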
Answer 1 (score: 0)
You could do:
hrefs = []
for link in links:
    print link
    hrefs.append(link.get('href'))

hrefs = list(set(hrefs))

with open('bugun_url_collection.txt', 'a') as f:
    f.write('\n'.join(hrefs))
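One caveat: list(set(hrefs)) loses the order in which the links were found. If order matters, a minimal order-preserving variant (the seen and unique_hrefs names are made up for illustration) looks like this:

seen = set()
unique_hrefs = []
for href in hrefs:
    if href not in seen:  # keep only the first occurrence of each URL
        seen.add(href)
        unique_hrefs.append(href)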
Answer 2 (score: 0)
You should separate the code that generates the links from the code that saves them:
def generate_urls(filename, urlopen):
    with open(filename) as file:
        for line in file:
            soup = BeautifulSoup(urlopen(line.strip()))
            for link in soup.select('div.nwslist a[href^="http"]'):
                yield link['href']

links = set(generate_urls('bugun_url_given.txt', opener.open))

with open('bugun_url_collection.txt', 'w') as file:
    file.write("\n".join(links))
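A side benefit of that separation is that generate_urls can be exercised without touching the network, by passing a stub in place of opener.open. A minimal sketch (fake_urlopen, the canned HTML, and test_input.txt are all made up for illustration):

from StringIO import StringIO

def fake_urlopen(url):
    # Canned page containing the same link twice, standing in for a real response.
    return StringIO('<div class="nwslist">'
                    '<a href="http://example.com/a">a</a>'
                    '<a href="http://example.com/a">a again</a>'
                    '</div>')

with open('test_input.txt', 'w') as f:
    f.write('http://does-not-matter.example\n')

print set(generate_urls('test_input.txt', fake_urlopen))
# prints set(['http://example.com/a'])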
Answer 3 (score: 0)
You have nested for loops, so you iterate over the links len(links) times:
links = soup.select('div.nwslist a')
for link in links:
    ...
    with open('bugun_url_collection.txt', 'a') as f:
        for link in links:
            f.write(link.get('href') + '\n')
What you really want is:
with open('bugun_url_given.txt', 'r') as text_file, \
        open('bugun_url_collection.txt', 'a') as f:
    for line in text_file:
        print line
        soup = BeautifulSoup(opener.open(line))
        hrefs = set(link.get('href') for link in soup.select('div.nwslist a'))
        for href in hrefs:
            print href
            f.write(href + '\n')
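Worth noting (an aside, not part of the original answer): the set above is rebuilt for every input line, so a URL that appears on two different pages is still written twice. Keeping one set alive across the whole loop dedupes globally; a sketch:

seen = set()
with open('bugun_url_given.txt', 'r') as text_file, \
        open('bugun_url_collection.txt', 'a') as f:
    for line in text_file:
        soup = BeautifulSoup(opener.open(line))
        for link in soup.select('div.nwslist a'):
            href = link.get('href')
            if href and href not in seen:  # skip anchors without href and repeats
                seen.add(href)
                f.write(href + '\n')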