import bs4 as bs
import urllib.request
import re
# Download the page. 'url' is a placeholder and must be replaced with the
# real address before running. The response is used as a context manager so
# the underlying connection is closed as soon as the body has been read.
with urllib.request.urlopen('url') as response:
    sauce = response.read()

soup = bs.BeautifulSoup(sauce, 'lxml')
print(soup.text)

# Collect every text node containing the substring 'risk' (case-sensitive,
# because the pattern is compiled without flags).
test = soup.find_all(string=re.compile('risk'))
print(test)
我想在一个段落中查找特定单词“risk”(风险)。有人可以帮我编写代码,检查该段落中是否存在该单词吗?如果存在,我只想提取关键字前后各 6 个单词。预先感谢。
答案 0 :(得分:1)
我认为此解决方案应该有效。即使关键字前后不足 6 个单词,它也会给出输出。而且它可以正确匹配“risk”,而不会匹配“risky”之类的词。
您必须进行一些修改以匹配您的用例。
from bs4 import BeautifulSoup
import urllib.request
import re
url = 'https://www.investing.com/analysis/2-reasons-merck-200373488'

# Whole-word, case-insensitive match: 'Risk', 'risk.', 'risk,' and '(risk)'
# all match, while 'risky', 'risks' and 'asterisk' do not.
pattern = re.compile(r'\brisk\b', re.IGNORECASE)
no_of_words = 6


def keyword_windows(text, keyword_pattern=pattern, window=no_of_words):
    """Return the words surrounding every keyword occurrence in *text*.

    The text is split on any run of whitespace, so repeated spaces and
    newlines never produce empty "words".

    Args:
        text: The string to scan.
        keyword_pattern: Compiled regex searched for inside each word.
        window: Maximum number of words to keep on each side of a match.

    Returns:
        A list of ``(before, keyword, after)`` tuples, one per matching
        word, where ``before`` and ``after`` are lists holding at most
        ``window`` words each (fewer near the edges of the text).
    """
    words = text.split()
    windows = []
    for index, word in enumerate(words):
        if not keyword_pattern.search(word):
            continue
        # Clamp at 0: a negative start would wrap around to the end.
        start = max(index - window, 0)
        # Slicing past the end of the list is harmless, so no upper clamp.
        end = index + window + 1
        windows.append((words[start:index], word, words[index + 1:end]))
    return windows


def main():
    """Fetch the page and print the context of every keyword occurrence."""
    req = urllib.request.Request(
        url,
        data=None,
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
        }
    )
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        sauce = response.read()
    soup = BeautifulSoup(sauce, 'html.parser')

    # Each hit is a text node; the surrounding words are taken from the
    # full text of its parent element.
    for elem in soup(string=pattern):
        for before, keyword, after in keyword_windows(elem.parent.text):
            print(' '.join(before + [keyword] + after))
            print("List of Word Before: ", before)  # words before
            print("List of Words After: ", after)  # words after
            print()


if __name__ == '__main__':
    main()
输出
Risk Warning
List of Word Before: []
List of Words After: ['Warning']
Risk Disclosure:
List of Word Before: []
List of Words After: ['Disclosure:']
Risk Disclosure: Trading in financial instruments and/or
List of Word Before: []
List of Words After: ['Disclosure:', 'Trading', 'in', 'financial', 'instruments', 'and/or']
cryptocurrencies involves high risks including the risk of losing some, or all, of
List of Word Before: ['cryptocurrencies', 'involves', 'high', 'risks', 'including', 'the']
List of Words After: ['of', 'losing', 'some,', 'or', 'all,', 'of']
investment objectives, level of experience, and risk appetite, and seek professional advice where
List of Word Before: ['investment', 'objectives,', 'level', 'of', 'experience,', 'and']
List of Words After: ['appetite,', 'and', 'seek', 'professional', 'advice', 'where']
investment objectives, level of experience, and risk appetite, and seek professional advice where
List of Word Before: ['investment', 'objectives,', 'level', 'of', 'experience,', 'and']
List of Words After: ['appetite,', 'and', 'seek', 'professional', 'advice', 'where']
答案 1 :(得分:0)
这里有个简单的例子。请注意,如果在关键字之前/之后少于6个单词,则我没有考虑这种情况。但这为您提供了一般的开始/想法
from bs4 import BeautifulSoup
import requests
import re
key_word = 'risk'
url = 'https://www.investing.com/analysis/2-reasons-merck-200373488'


def context_window(paragraph, word=key_word, count=6):
    """Return the words around the first standalone *word* in *paragraph*.

    The paragraph is split on whitespace and *word* must equal one of the
    resulting tokens exactly (case-sensitive, no attached punctuation).

    Args:
        paragraph: The text to scan.
        word: The token to look for.
        count: Maximum number of words to keep on each side of the token.

    Returns:
        A list made of up to ``count`` words before the token, the token
        itself, and up to ``count`` words after it; ``None`` when the token
        does not occur in the paragraph.
    """
    tokens = paragraph.split()
    if word not in tokens:
        return None
    idx = tokens.index(word)
    # Clamp at 0: with fewer than ``count`` preceding words a negative start
    # would wrap around and drop (or mis-select) the leading words.
    start = max(idx - count, 0)
    return tokens[start:idx + count + 1]


def main():
    """Download the page and print the context of the keyword per paragraph."""
    with requests.Session() as s:
        # Update rather than replace, so the session keeps its default
        # header container.
        s.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
            "Accept-Encoding": "gzip, deflate",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "en"
        })
        response = s.get(url)

    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = soup.find_all(string=re.compile(key_word))
    if not paragraphs:
        print('"%s" not found.' % (key_word))
        return

    for paragraph in paragraphs:
        six_words = context_window(paragraph)
        # Text nodes that only contain the keyword inside another word
        # (e.g. 'risks') have no standalone token and are skipped.
        if six_words is not None:
            print(' '.join(six_words) + '\n')


if __name__ == '__main__':
    main()
输出:
cryptocurrencies involves high risks including the risk of losing some, or all, of
investment objectives, level of experience, and risk appetite, and seek professional advice where