I want to scrape all of the company names from all of the links on this page:
https://www.bilansgratuits.fr/secteurs/finance-assurance,k.html
Each of those links lists several companies, like this one:
https://www.bilansgratuits.fr/classement/6420Z/default.html
My goal is to get every company from every one of those links.
Here is my script so far:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from nltk.tokenize import RegexpTokenizer

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

tokenizer = RegexpTokenizer(r'\w+')

def clean_text(text):
    # Keep only word characters and rejoin them with single spaces.
    return ' '.join(tokenizer.tokenize(text))

url = 'https://www.bilansgratuits.fr/secteurs/finance-assurance,k.html'
results = requests.get(url)
soup = BeautifulSoup(results.text, "html.parser")

# Every sector link on the index page.
links = [a['href'] for a in soup.find("div", {"class": "listeEntreprises"}).find_all('a', href=True)]

names = []
root_url = 'https://www.bilansgratuits.fr/'
urls = ['{root}{i}'.format(root=root_url, i=i) for i in links]

for url in urls[:3]:
    results = requests.get(url)
    soup = BeautifulSoup(results.text, "html.parser")
    try:
        name = [a.text for a in soup.find("div", {"class": "donnees"}).find_all('a', href=True)]
    except AttributeError:
        # Some pages use the index layout instead.
        name = [a.text for a in soup.find("div", {"class": "listeEntreprises"}).find_all('a', href=True)]
    names.append(name)

# Drop whitespace-only entries and split each name into words.
rx = re.compile(r'^\s+$')
for i in range(0, 3):
    names[i] = [item.split() for item in names[i] if not rx.match(item)]

data = pd.DataFrame({
    'names': names
})
data['names'] = data['names'].apply(str)
data['names'] = data['names'].apply(clean_text)
print(data)
# data.to_csv('dftest.csv', sep=';', index=False, encoding='utf_8_sig')
With this, each output row holds all of the names scraped from one page. But that's not what I want: I'd like one company name per row, and so on for all of the names.
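To make the target shape concrete, here is a minimal sketch with made-up company names (hypothetical data; the real nested list comes from the scraper above):

import pandas as pd

# Hypothetical nested list: one inner list of names per scraped page.
names = [['AXA', 'BNP PARIBAS'], ['CREDIT AGRICOLE']]

# Flatten so each company gets its own DataFrame row.
flat = [name for page in names for name in page]
print(pd.DataFrame({'names': flat}))
#              names
# 0              AXA
# 1      BNP PARIBAS
# 2  CREDIT AGRICOLE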
Answer 0 (score: 1)
Is this what you want?
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://www.bilansgratuits.fr/secteurs/finance-assurance,k.html"
html = requests.get(url).text

# Collect the sector links from the index page.
follow_urls = [
    f"https://www.bilansgratuits.fr{anchor['href']}" for anchor
    in BeautifulSoup(html, "html.parser").select(".titreElementAnnuaire a")
]

data = []
for follow_url in follow_urls:
    print(f"Fetching: {follow_url}")
    # The 6411Z page uses a different layout, so switch selectors for it.
    css_selector = ".titreElementAnnuaire a" if "6411Z" in follow_url else ".classementTop .blocRaisonSociale > a"
    company_urls = BeautifulSoup(
        requests.get(follow_url).text,
        "html.parser",
    ).select(css_selector)
    data.extend(
        [
            [
                " ".join(anchor.getText(strip=True).split()),
                f"https://www.bilansgratuits.fr{anchor['href']}",
            ] for anchor in company_urls
        ]
    )

pd.DataFrame(data).to_csv("your_data.csv", index=False, header=["Company", "URL"])
print("Done!")
Output: 345 entries in the .csv file.
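To sanity-check the result, the file can be read back (a small usage sketch, assuming the your_data.csv written above):

import pandas as pd

df = pd.read_csv("your_data.csv")
print(len(df))     # should report 345 rows
print(df.head())   # Company, URL columns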
Answer 1 (score: 0)
Here is my final answer!
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

url = 'https://www.bilansgratuits.fr/secteurs/finance-assurance,k.html'
results = requests.get(url)
#time.sleep(20)
soup = BeautifulSoup(results.text, "html.parser")

# Sector links and their display names from the index page.
listing = soup.find("div", {"class": "listeEntreprises"})
links = [a['href'] for a in listing.find_all('a', href=True)]
secteur = [a.text for a in listing.find_all('a', href=True)]

secteurs = []
URLS = []
names = []
root_url = 'https://www.bilansgratuits.fr/'
urls = ['{root}{i}'.format(root=root_url, i=i) for i in links]

for url, sect in zip(urls[:3], secteur[:3]):
    results = requests.get(url)
    soup = BeautifulSoup(results.text, "html.parser")
    try:
        name = [a.text for a in soup.find("div", {"class": "donnees"}).find_all('a', href=True)]
    except AttributeError:
        name = [a.text for a in soup.find("div", {"class": "listeEntreprises"}).find_all('a', href=True)]
    # Record the page URL and sector once per company found on the page.
    for _ in name:
        URLS.append(url)
        secteurs.append(sect)
    names.append(name)

# Drop whitespace-only entries and split each name into words.
rx = re.compile(r'^\s+$')
for i in range(0, 3):
    names[i] = [item.split() for item in names[i] if not rx.match(item)]

# Flatten the nested lists so each row holds a single company name.
res = []
for sublist in names:
    for words in sublist:
        res.append(' '.join(words))

data = pd.DataFrame({
    'names': res,
    'URL': URLS,
    'Secteur': secteurs
})
data.to_csv('dftest.csv', sep=';', index=False, encoding='utf_8_sig')
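Since the CSV is written with sep=';', it has to be read back with the same separator (a small usage sketch):

import pandas as pd

df = pd.read_csv('dftest.csv', sep=';', encoding='utf_8_sig')
print(df.columns.tolist())   # ['names', 'URL', 'Secteur']
print(df.head())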