我需要通过 API 获取一个包含行、列数据的 HTML 表格,并把其中的数据整理后提供给其他团队使用。
import requests
import json
import html2text
#from bs4 import BeautifulSoup
# Request headers for the Jive REST API call below.
# NOTE(review): the bearer token is hard-coded; it appears to be a sandbox
# credential, but a real token should come from the environment or a secrets
# store rather than live in source.
headers = {
    'Authorization': 'Bearer hmy0w2ltszfxeysnq8cbjzfcyr4kzfk5k9a0vfca.t',
    'Content-Type': 'application/json',
}
# Empty JSON body, sent with the GET exactly as before.
payload = '{}'
response = requests.get(
    'https://sandbox.jiveon.com/api/core/v3/contents/436669',
    headers=headers,
    data=payload,
    timeout=30,  # requests has no default timeout; avoid hanging forever
)
# Surface 4xx/5xx as an HTTPError here instead of a confusing KeyError or
# JSON decoding error further down.
response.raise_for_status()
data = response.json()
# The document body is returned as an HTML string under content -> text.
print(data['content']['text'])
然后将其转换为纯文本:
# Render the HTML document body as text with html2text.
# The converter is named `converter` rather than `format` so the built-in
# format() is not shadowed at module level.
converter = html2text.HTML2Text()
converter.ignore_links = True
# Keep html2text's own table rendering (tables are not passed through as raw
# HTML).
converter.bypass_tables = False
#converter.ignore_tables = True
converter.wrap_links = True  # was assigned twice in the original; once is enough
converter.ignore_images = True
converter.ignore_emphasis = True
print(converter.handle(data['content']['text']))
以上代码段的输出为:
<body><!-- [DocumentBodyStart:756f88b6-eed4-4030-ada9-f74dc8e4418b] --><div class="jive-rendered-content"><p>DB Release </p><p style="min-height: 8pt; padding: 0px;"> </p><div class="j-rte-table"><table class="j-table jiveBorder" style="border: 1px solid #c6c6c6;" width="100%"><thead><tr style="background-color: #efefef;"><th style="width: 11%;">Release Version</th><th style="width: 10%;">REFDB_ID</th><th style="width: 160%;">SVN URL</th></tr></thead><tbody><tr><td style="width: 11%;">3.7.3</td><td style="width: 10%;"><p style="background-color: #ffffff; border: 0px; padding: 0px;">3710002</p><p style="background-color: #ffffff; border: 0px; padding: 0px;">3710003 <br/>3710005 <br/>3710007 <br/>3710009<br/>3710011</p></td><td style="width: 160%;"><p style="background-color: #ffffff; border: 0px; padding: 0px;"><a class="jive-link-external-small" href="http://svnurl.com" rel="nofollow">http://svnurl1.com </a></p><p style="background-color: #ffffff; border: 0px; padding: 0px;"><a class="jive-link-external-small" href="http://svnurl2.com" rel="nofollow">http://svnurl2.com</a></p></td></tr></tbody></table></div></div><!-- [DocumentBodyEnd:756f88b6-eed4-4030-ada9-f74dc8e4418b] --></body>
DB Release
Release Version| REFDB_ID| SVN URL
---|---|---
3.7.3|
3710002
3710003
3710005
3710007
3710009
3710011
|
http://svnurl1.com
http://svnurl2.com
答案 0 :(得分:0)
我最终采用的解决方案会根据命令行参数筛选出所需的数据。
import requests
import json
import sys
from bs4 import BeautifulSoup
from sys import argv
from xml.etree import ElementTree as ET
# Request headers for the Jive REST API call below.
# NOTE(review): the bearer token is hard-coded; it appears to be a sandbox
# credential, but a real token should come from the environment or a secrets
# store rather than live in source.
headers = {
    'Authorization': 'Bearer hmy0w2ltszfxeysnq8cbjzfcyr4kzfk5k9a0vfca.t',
    'Content-Type': 'application/json',
}
# Empty JSON body, sent with the GET exactly as before.
payload = '{}'
response = requests.get(
    'https://sandbox.jiveon.com/api/core/v3/contents/436669',
    headers=headers,
    data=payload,
    timeout=30,  # requests has no default timeout; avoid hanging forever
)
# Surface 4xx/5xx as an HTTPError here instead of a confusing KeyError or
# JSON decoding error further down.
response.raise_for_status()
data = response.json()
# The document body is returned as an HTML string under content -> text.
html_doc = data['content']['text']
soup = BeautifulSoup(html_doc, 'html.parser')
# Release version to look for. It is taken from the first command-line
# argument and falls back to the value this script previously hard-coded.
target_version = argv[1] if len(argv) > 1 else '3.7.4'

mytag = []      # <td> cells of the matching table row
mydata = []     # raw text of each matching cell, one entry per cell
finaldata = []  # individual values, one entry per text fragment

for row in soup.find_all('tr'):
    # Substring match against the row's markup, as before; if several rows
    # match, the last one wins. Rows are already parsed, so there is no need
    # to build a new BeautifulSoup object per row.
    if target_version in str(row):
        mytag = row.find_all('td')

for cell in mytag:
    mydata.append(cell.get_text())
    # A cell may hold several <p>/<br>-separated values (IDs, URLs).
    # get_text() glues them together, so collect each whitespace-stripped
    # fragment on its own instead of splitting the joined text on '.com'.
    finaldata.extend(cell.stripped_strings)

if not finaldata:
    # Report on stderr so stdout stays clean for whatever consumes the values.
    sys.stderr.write('No table row found for release %s\n' % target_version)

for value in finaldata:
    print(value)