我如何从下面的页面抓取数据,并把公司、姓名、地址、城市、邮政编码、电话、电子邮件、网站分别作为单独的列?页面地址:https://directory.justice.org/SearchResult.asp?access=public&firstmiddlename=&middlename=&lastname=&maidenname=&firmname=&city=&provstateid=&zip=&countryid=&keyword=&areaofpractice=&areaofpractice2=Personal+Injury&sectiontype=&memtype=&sb=&gender=Any
我想把每位律师的详细信息按 br 标签拆分成单独的字段。找到包含律师详细信息的 div 之后我就卡住了:如何把由 br 标签分隔的每一段文本分别赋给姓名、地址等字段?
import pandas as pd
from bs4 import BeautifulSoup, Tag
import requests
import re
# Accumulator intended for the scraped rows; nothing in this snippet fills it.
data = []

# Fetch the public directory search results.
# The query string must read "&sectiontype=": a lenient HTML decoder turns the
# legacy "&sect" entity into "§", which corrupts the areaofpractice2 value and
# drops the sectiontype parameter.
res = requests.get(
    "https://directory.justice.org/SearchResult.asp"
    "?access=public&firstmiddlename=&middlename=&lastname=&maidenname="
    "&firmname=&city=&provstateid=&zip=&countryid=&keyword="
    "&areaofpractice=&areaofpractice2=Personal+Injury&sectiontype="
    "&memtype=&sb=&gender=Any",
    timeout=30,  # requests waits indefinitely unless a timeout is given
)
# Fail loudly on an HTTP error instead of parsing an error page.
res.raise_for_status()

soup = BeautifulSoup(res.text, 'lxml')
# Presumably each <div style="float:left"> holds one lawyer entry -- verify
# against the live page markup.
lawyers = soup.find_all('div', {'style': 'float:left'})
答案 0 :(得分:2)
尝试一下:
from bs4 import BeautifulSoup, Tag, NavigableString
import pandas as pd
import requests
# Fetch the public directory search results.
# The query string must read "&sectiontype=": a lenient HTML decoder turns the
# legacy "&sect" entity into "§", which corrupts the areaofpractice2 value and
# drops the sectiontype parameter.
res = requests.get(
    "https://directory.justice.org/SearchResult.asp"
    "?access=public&firstmiddlename=&middlename=&lastname=&maidenname="
    "&firmname=&city=&provstateid=&zip=&countryid=&keyword="
    "&areaofpractice=&areaofpractice2=Personal+Injury&sectiontype="
    "&memtype=&sb=&gender=Any",
    timeout=30,  # requests waits indefinitely unless a timeout is given
)
# Fail loudly on an HTTP error instead of parsing an error page.
res.raise_for_status()

soup = BeautifulSoup(res.text, 'lxml')
# Presumably each <div style="float:left"> holds one lawyer entry -- verify
# against the live page markup.
lawyers = soup.find_all('div', {'style': 'float:left'})

# One list of text fragments per matched div; each list becomes a row.
roster = []
for law in lawyers:
    data = []
    # Walk the div's direct children in document order.
    for item in law:
        if isinstance(item, Tag):
            # Keep a tag only when it carries visible text; this skips the
            # empty <br> separators.
            text = item.text.strip()
            if text:
                data.append(text)
        elif isinstance(item, NavigableString):
            # Bare text between tags is kept even when it strips to '', so
            # the original column positions are preserved.
            data.append(item.strip())
    roster.append(data)

# Rows of unequal length are padded by pandas with missing values.
df = pd.DataFrame(roster)
# Bare expression: only displays a preview in a notebook/REPL.
df.head()
答案 1 :(得分:0)
import pandas as pd
from bs4 import BeautifulSoup, Tag
import requests
import re
# One dict per lawyer entry; turned into a DataFrame at the end.
data = []

# Fetch the public directory search results.
# The query string must read "&sectiontype=": a lenient HTML decoder turns the
# legacy "&sect" entity into "§", which corrupts the areaofpractice2 value and
# drops the sectiontype parameter.
res = requests.get(
    "https://directory.justice.org/SearchResult.asp"
    "?access=public&firstmiddlename=&middlename=&lastname=&maidenname="
    "&firmname=&city=&provstateid=&zip=&countryid=&keyword="
    "&areaofpractice=&areaofpractice2=Personal+Injury&sectiontype="
    "&memtype=&sb=&gender=Any",
    timeout=30,  # requests waits indefinitely unless a timeout is given
)
# Fail loudly on an HTTP error instead of parsing an error page.
res.raise_for_status()

soup = BeautifulSoup(res.text, 'lxml')
# Presumably each <div style="float:left"> holds one lawyer entry -- verify
# against the live page markup.
lawyer = soup.find_all('div', {'style': 'float:left'})

for item in lawyer:
    # Fields are read by fixed position among the div's children.
    # NOTE(review): an entry with fewer children than expected raises
    # IndexError here -- confirm every entry has the same layout.
    lawyer_company = item.contents[0].text
    lawyer_name = item.contents[2]
    lawyer_address = item.contents[4]
    # NOTE(review): City, State and Zip all receive the same unsplit child
    # (index 6); split it once the exact format of that line is confirmed.
    lawyer_city = item.contents[6]
    lawyer_state = item.contents[6]
    lawyer_zip = item.contents[6]
    lawyer_phone = item.contents[8]

    # Email and website may be wrapped in a tag; reduce those to their text.
    lawyer_email = item.contents[11]
    if isinstance(lawyer_email, Tag):
        lawyer_email = lawyer_email.text.strip()
    lawyer_website = item.contents[13]
    if isinstance(lawyer_website, Tag):
        lawyer_website = lawyer_website.text.strip()

    full_dict = {
        'Company': lawyer_company,
        'Name': lawyer_name,
        'Address': lawyer_address,
        'City': lawyer_city,
        'State': lawyer_state,
        'Zip': lawyer_zip,
        'Phone': lawyer_phone,
        # Fixed: this column used to be filled with lawyer_phone.
        'Email': lawyer_email,
        'Website': lawyer_website,
    }
    data.append(full_dict)

df = pd.DataFrame(data)
print(df)