import requests
from bs4 import BeautifulSoup
import pandas as pd
import pdfkit
import re
URL = 'https://timesofindia.indiatimes.com/'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'lxml')
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'lxml')
all_links=set()
for link in soup.find_all('a'):
all_links.add(link.get('href'))
s = list(all_links)
print(s)
x=[i for i in s if i._contains_(URL)]
m=[]
find_words= ['cbse', 'first-day']
for s in x:
if any(f in s for f in find_words):
m.append(s)
print(m)
答案 0 :(得分:0)
您的包含行无效。
尝试
x=[i for i in s if URL in i]
答案 1 :(得分:0)
我正在尝试使用不同的网址跟踪上述问题的代码
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pdfkit
import re
URL = 'https://timesofindia.indiatimes.com'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'lxml')
#page = requests.get(URL)
#soup = BeautifulSoup(page.content, 'lxml')
all_links=set()
for link in soup.find_all('a'):
all_links.add(link.get('href'))
s = list(all_links)
#print(s)
a=[i for i in s if "http" in i]
b=[i for i in s if "http" not in i]
s1=URL+'/'
#print(s1)
if len(b) > 0:
c=list(map(s1.__add__,b))
x=a+c
#print(x)
m=[]
find_words= ['school', 'army']
find_words = [item.lower() for item in find_words]
#print(find_words)
for s in x:
if any(f in s for f in find_words):
m.append(s)
if len(m) > 0 :
print(m)
else:
print("No Matching Words Found")
有时它可以正常工作,有时会出现以下错误
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-56-5c34031de8f2> in <module>
18 s = list(all_links)
19 #print(s)
---> 20 a=[i for i in s if "http" in i]
21 b=[i for i in s if "http" not in i]
22 s1=URL+'/'
<ipython-input-56-5c34031de8f2> in <listcomp>(.0)
18 s = list(all_links)
19 #print(s)
---> 20 a=[i for i in s if "http" in i]
21 b=[i for i in s if "http" not in i]
22 s1=URL+'/'
TypeError: argument of type 'NoneType' is not iterable
不明白为什么?您能帮我做错什么以及如何纠正吗?
谢谢