我有以下来自 HTML 源代码文件的原始数据:
{$deletedFields:[courses,projects,description,degreeName,recommendations,honors,entityLocale,activities,grade,fieldOfStudyUrn,testScores,degreeUrn],entityUrn:urn:li:fs_education:(ACoAAAIUJvgBC7QTHSmLEjgtomzxvwceeM71E1c,75863717),school:urn:li:fs_miniSchool:11709,timePeriod:urn:li:fs_education:(ACoAAAIUJvgBC7QTHSmLEjgtomzxvwceeM71E1c,75863717),timePeriod,schoolName:Charles University in Prague,fieldOfStudy:Economics, Politics,schoolUrn:urn:li:fs_miniSchool:11709,$type:com.linkedin.voyager.identity.profile.Education},
{$deletedFields:[courses,projects,description,recommendations,honors,entityLocale,activities,grade,fieldOfStudyUrn,testScores,degreeUrn],entityUrn:urn:li:fs_education:(ACoAAAIUJvgBC7QTHSmLEjgtomzxvwceeM71E1c,26812055),school:urn:li:fs_miniSchool:17888,timePeriod:urn:li:fs_education:(ACoAAAIUJvgBC7QTHSmLEjgtomzxvwceeM71E1c,26812055),timePeriod,degreeName:BA,schoolName:Occidental College,fieldOfStudy:Economics,schoolUrn:urn:li:fs_miniSchool:17888,$type:com.linkedin.voyager.identity.profile.Education},
{$deletedFields:[],profileId:ACoAAAIUJvgBC7QTHSmLEjgtomzxvwceeM71E1c,elements:[urn:li:fs_education:(ACoAAAIUJvgBC7QTHSmLEjgtomzxvwceeM71E1c,26812055),urn:li:fs_education:(ACoAAAIUJvgBC7QTHSmLEjgtomzxvwceeM71E1c,75863717)],paging:urn:li:fs_profileView:ACoAAAIUJvgBC7QTHSmLEjgtomzxvwceeM71E1c,educationView,paging,$type:com.linkedin.voyager.identity.profile.EducationView,$id:urn:li:fs_profileView:ACoAAAIUJvgBC7QTHSmLEjgtomzxvwceeM71E1c,educationView},
{$deletedFields:[],start:501,end:1000,$type:com.linkedin.voyager.identity.profile.EmployeeCountRange,$id:urn:li:fs_position:(ACoAAAIUJvgBC7QTHSmLEjgtomzxvwceeM71E1c,323432440),company,employeeCountRange}
{$deletedFields:[month,day],year:2007,$type:com.linkedin.common.Date,$id:urn:li:fs_education:(ACoAAAIUJvgBC7QTHSmLEjgtomzxvwceeM71E1c,75863717),timePeriod,startDate},
{$deletedFields:[month,day],year:2004,$type:com.linkedin.common.Date,$id:urn:li:fs_education:(ACoAAAIUJvgBC7QTHSmLEjgtomzxvwceeM71E1c,26812055),timePeriod,startDate},
{$deletedFields:[month,day],year:2008,$type:com.linkedin.common.Date,$id:urn:li:fs_education:(ACoAAAIUJvgBC7QTHSmLEjgtomzxvwceeM71E1c,26812055),timePeriod,endDate},
{$deletedFields:[month,day],year:2007,$type:com.linkedin.common.Date,$id:urn:li:fs_education:(ACoAAAIUJvgBC7QTHSmLEjgtomzxvwceeM71E1c,75863717),timePeriod,endDate},
我需要从这些数据中提取部分字段。
# Keys in the raw data are camelCase and the match is case-sensitive, so the
# pattern has to spell ``schoolName`` exactly; the lowercase ``schoolname``
# never matches and yields an empty list.
schoolname = re.findall(r',schoolName:(.*?),', page_html)
# The value may itself contain commas ("Economics, Politics"), so the capture
# runs up to the ",s" that begins the following ``schoolUrn`` key.
# ``skills`` is bound to the same list object as ``fieldofstudy``.
fieldofstudy = skills = re.findall(r'fieldOfStudy:(.*?),s', page_html)
degreename = re.findall(r'degreeName:(.*?),', page_html)
schoolName:Charles University in Prague
fieldOfStudy:Economics, Politics
开始:2007
结束:2007
schoolName:Occidental College
fieldOfStudy:Economics
degreeName:BA
开始:2004
结束:2008
答案 0 :(得分:0)
问题:我需要从中提取一些数据

定义数据容器 `class School`:
class School(object):
    """Container for the fields parsed out of one raw record.

    ``raw_data`` is the list obtained by splitting the body of a record on
    commas. Each item is either a ``key:value`` pair or a bare token; a bare
    token is interpreted relative to the most recent key.

    Attributes are only created when the record supplies them (``entity``,
    ``schoolName``, ``fieldOfStudy``, ``startDate``, ``endDate``), so callers
    can use ``hasattr`` to tell what kind of record was parsed.
    ``degreeName`` is always present and defaults to ``'unknown'``.
    """

    # Keys whose value is copied onto the instance unchanged.
    _COPIED_KEYS = ('schoolName', 'fieldOfStudy', 'startDate', 'endDate',
                    'degreeName')
    # Keys whose URN value ends in "...,<numeric id>)"; the id arrives as the
    # next bare token because the record was split on commas.
    _ENTITY_KEYS = ('entityUrn', '$id')
    # Bare tokens naming which date a ``year`` value belongs to.
    _DATE_TOKENS = ('startDate', 'endDate')
    # Attributes shown by ``__repr__``, in display order.
    _REPR_FIELDS = ('entity', 'schoolName', 'fieldOfStudy', 'degreeName',
                    'startDate', 'endDate')

    def __init__(self, raw_data):
        """Populate the instance from the comma-split items in ``raw_data``."""
        key = None
        year = '?'
        for kv in raw_data:
            i = kv.find(':')
            if i >= 0:
                # "key:value" item; only the first colon separates the key.
                key = kv[:i]
                value = kv[i + 1:]
                if key in self._COPIED_KEYS:
                    setattr(self, key, value)
                elif key == 'year':
                    year = value
                continue

            # Bare token: its meaning depends on the preceding key.
            if key in self._ENTITY_KEYS:
                # e.g. "75863717)" -> drop the closing parenthesis.
                if kv[:-1].isdigit():
                    self.entity = kv[:-1]
            elif key == 'fieldOfStudy':
                # The value itself contained a comma. Keep ``key`` so that
                # any further parts are appended too, and strip the part so
                # the joined text has exactly one space after each comma.
                self.fieldOfStudy += ', ' + kv.strip()
                continue
            elif kv in self._DATE_TOKENS:
                setattr(self, kv, year)
            # A bare token ends the current key's value.
            key = ''

        if not hasattr(self, 'degreeName'):
            self.degreeName = 'unknown'

    def __repr__(self):
        """Return a multi-line, right-aligned listing of the fields.

        Fields the record did not supply are shown as ``'?'`` instead of
        raising ``AttributeError``.
        """
        values = dict((name, getattr(self, name, '?'))
                      for name in self._REPR_FIELDS)
        return ("entity:\t\t{entity:>28}\n"
                "schoolName:\t{schoolName:>28}\n"
                "fieldOfStudy:{fieldOfStudy:>27}\n"
                "degreeName:\t{degreeName:>28}\n"
                "startDate:\t{startDate:>28}\n"
                "endDate:\t{endDate:>28}\n").format(**values)
逐行读取文件:
import re

# One record per line: "{<name>:[<list>],<comma-separated body>}".
# Group 3 is the body that School parses.
_RECORD_RE = re.compile(r'\{(.*?)\:\[(.*?)\],(.*)\}')

with open('<path to file>') as fh:
    # Education records keyed by their numeric entity id.
    degreeUrn = {}
    # (entity, attribute name, year) triples collected from date records.
    # They are applied after the whole file has been read so the result does
    # not depend on whether a date record precedes its education record.
    _pending_dates = []
    for line in fh:
        match = _RECORD_RE.search(line)
        if match is None:
            # Blank line or anything that is not a record.
            continue
        school = School(match.group(3).split(','))
        if not hasattr(school, 'entity'):
            continue
        if hasattr(school, 'startDate'):
            _pending_dates.append(
                (school.entity, 'startDate', school.startDate))
        elif hasattr(school, 'endDate'):
            _pending_dates.append(
                (school.entity, 'endDate', school.endDate))
        elif hasattr(school, 'schoolName'):
            degreeUrn[school.entity] = school

# Date records whose entity is not a known education record (for example
# dates belonging to a position) are ignored rather than raising KeyError.
for _entity, _attr, _year in _pending_dates:
    if _entity in degreeUrn:
        setattr(degreeUrn[_entity], _attr, _year)

for entity in degreeUrn:
    print(degreeUrn[entity])
**输出**:
entity: 75863717
schoolName: Charles University in Prague
fieldOfStudy: Economics, Politics
degreeName: unknown
startDate: 2007
endDate: 2007

entity: 26812055
schoolName: Occidental College
fieldOfStudy: Economics
degreeName: BA
startDate: 2004
endDate: 2008
已使用 Python 3.4.2 测试