我正在尝试从一个页面中提取数据(页面结构见下面粘贴的内容),希望从中分别取出 10 V、20 V……每个系列的 x 和 y 值,并把它们按系列分开。解析得到数据列表之后,我最终打算为每个系列各生成一张图。
soup(BeautifulSoup 解析得到的页面内容):
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-72511212-1', 'auto');
ga('send', 'pageview');
</script>
<!DOCTYPE html>
<html>
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<title>Metabolite Chart</title>
<!-- 1. Add these JavaScript inclusions in the head of your page -->
<script src="lib/js/jquery-1.6.x/jquery-1.6.1.min.js" type="text/javascript"></script>
<script src="lib/js/misc/highcharts.src.delta.js" type="text/javascript"></script>
<!--<script type="text/javascript" src="/lib/highcharts.js"></script>-->
<script src="lib/js/misc/excanvas.compiled.js" type="text/javascript"></script>
<!--
<script src="https://code.highcharts.com/highcharts.js"></script>
<script src="https://code.highcharts.com/modules/exporting.js"></script>
-->
<!-- 2. Add the JavaScript to initialize the chart on document ready -->
<script type="text/javascript">
//alert("molI: " + "203");
//alert("molN: " + "Chenodeoxycholic acid glycine conjugate");
$(document).ready(function() {
//alert("molN: " + "Chenodeoxycholic acid glycine conjugate");
// function resetchart() {
// fireEvent(chart, 'selection', { resetSelection: true }, zoom);
// }
var count = 0;
//alert("molI: " + "203");
//var mid = 203; // Pass MID here!
var mid = "203"; // Pass
var mole = "Chenodeoxycholic acid glycine conjugate"; // Pass molecule name here!
var chart = new Highcharts.Chart({
chart: {
renderTo: 'container',
defaultSeriesType: 'column',
zoomType: 'xy',
margin: [50, 50, 200, 80]
},
title: {
text: '' + mole
},
subtitle: {
text: "MID: 203 <font color='blue'><b>Insilico predicted spectra<\/b><\/font>" },
credits: {
enabled: false
},
xAxis: {
min: 0,
// max: 200,
title: {
enabled: true,
text: 'Mass (m/z)'
},
maxZoom: 0.1,
tickPixelInterval: 100
},
yAxis: {
min: 0,
max: 100,
title: {
text: 'Intensity (%)'
}
},
legend: {
enabled: true,
showFragments: true,
showNeutrals: false,
showPeaks: false,
exclusiveSelect: true, // Turns on exclusive radio style buttons
dblClick: false,
startNumber: 0, // The default legend item when page loads
borderWidth: 1,
layout: 'vertical',
backgroundColor: '#FFFFFF',
style: {
left: '50px',
top: '300px',
bottom: 'auto'
}
},
// Tooltip HTML
tooltip: {
second: true,
neutral: false,
borderRadius: 0,
formatter: function() {
var namestr;
if (this.series.name.match(/\+/g) && !this.series.name.match("Cl"))
namestr = "Mode: <b><font size=\"4\">(+)</font></b> Collision Energy: ";
else if (this.series.name.match('-'))
namestr = "Mode: <b><font size=\"4\">(-)</font></b> Collision Energy: ";
if (!(this.series.name.match("10 V")||this.series.name.match("20 V")||this.series.name.match("40 V")))
namestr += "<b><font size=\"3\">0 V</font></b>";
else if (this.series.name.match("10 V"))
namestr += "<b><font size=\"3\">10 V</font></b>";
else if (this.series.name.match("20 V"))
namestr += "<b><font size=\"3\">20 V</font></b>";
else if (this.series.name.match("40 V"))
namestr += "<b><font size=\"3\">40 V</font></b>";
return '<center><br/> '+ namestr +'<br/>' + ' m/z: <b><font size="3">' + this.x.toFixed(4) + '</font></b> Intensity: <b><font size="3">' + parseInt(Math.abs(this.y)) + ' % </font></b></center><br/>';
},
formatter2: function() {
var namestr;
if (this.series.name.match(/\+/g))
namestr = "Mode: (+), Collision Energy: ";
else if (this.series.name.match('-'))
namestr = "Mode: (-), Collision Energy: ";
if (!(this.series.name.match("10 V")||this.series.name.match("20 V")||this.series.name.match("40 V")))
namestr += "0 V, Adduct: ";
else if (this.series.name.match("10 V"))
namestr += "10 V, Adduct: ";
else if(this.series.name.match("20 V"))
namestr += "20 V, Adduct: ";
else if (this.series.name.match("40 V"))
namestr += "40 V, Adduct: ";
return false;
}
},
plotOptions: {
column: {
pointPadding: 0.53,
// pointPadding: 0.99,
borderWidth: 0,
shadow: false
// borderColor: '#000000'
}
},
series: [{name: ' (+) 10 V [M+H]+ ',data:[{x:450.321,y:84,fragment: false},{x:434.29,y:0,fragment: false},{x:432.311,y:8,fragment: false},{x:432.311,y:100,fragment: false},{x:416.28,y:0,fragment: false},{x:414.3,y:24,fragment: false},{x:406.332,y:0,fragment: false},{x:404.316,y:16,fragment: false},{x:390.3,y:0,fragment: false},{x:390.264,y:0,fragment: false},{x:388.321,y:0,fragment: false},{x:386.305,y:8,fragment: false},{x:375.289,y:24,fragment: false},{x:372.29,y:0,fragment: false},{x:357.279,y:12,fragment: false},{x:347.294,y:4,fragment: false},{x:331.3,y:0,fragment: false},{x:329.284,y:4,fragment: false},{x:319.263,y:0,fragment: false},{x:301.253,y:0,fragment: false},{x:291.232,y:0,fragment: false},{x:273.221,y:0,fragment: false},{x:240.159,y:0,fragment: false},{x:158.081,y:0,fragment: false},{x:130.05,y:0,fragment: false},{x:76.0393,y:44,fragment: false},{x:74.0237,y:0,fragment: false},{x:59.0128,y:8,fragment: false},{x:58.0287,y:0,fragment: false},{x:30.0338,y:0,fragment: false},{x:28.0182,y:0,fragment: false} ]},{name: ' (+) 20 V [M+H]+ ',data:[{x:450.321,y:11.764705882353,fragment: false},{x:434.29,y:0,fragment: false},{x:432.311,y:35.294117647059,fragment: false},{x:432.311,y:5.8823529411765,fragment: false},{x:416.28,y:5.8823529411765,fragment: false},{x:414.3,y:58.823529411765,fragment: false},{x:404.316,y:11.764705882353,fragment: false},{x:388.321,y:5.8823529411765,fragment: false},{x:386.305,y:35.294117647059,fragment: false},{x:375.289,y:29.411764705882,fragment: false},{x:372.29,y:5.8823529411765,fragment: false},{x:357.279,y:35.294117647059,fragment: false},{x:347.294,y:17.647058823529,fragment: false},{x:333.279,y:5.8823529411765,fragment: false},{x:331.3,y:5.8823529411765,fragment: false},{x:329.284,y:17.647058823529,fragment: false},{x:327.268,y:5.8823529411765,fragment: false},{x:319.263,y:0,fragment: false},{x:315.268,y:5.8823529411765,fragment: false},{x:301.253,y:5.8823529411765,fragment: false},{x:273.221,y:5.8823529411765,fragment: 
false},{x:158.081,y:5.8823529411765,fragment: false},{x:130.05,y:5.8823529411765,fragment: false},{x:111.08,y:5.8823529411765,fragment: false},{x:102.019,y:0,fragment: false},{x:76.0393,y:100,fragment: false},{x:74.0237,y:5.8823529411765,fragment: false},{x:59.0128,y:23.529411764706,fragment: false},{x:58.0287,y:17.647058823529,fragment: false},{x:30.0338,y:11.764705882353,fragment: false},{x:28.0182,y:11.764705882353,fragment: false} ]},{name: ' (+) 40 V [M+H]+ ',data:[{x:416.28,y:18.181818181818,fragment: false},{x:414.3,y:100,fragment: false},{x:388.321,y:18.181818181818,fragment: false},{x:386.305,y:54.545454545455,fragment: false},{x:372.29,y:9.0909090909091,fragment: false},{x:359.294,y:9.0909090909091,fragment: false},{x:357.279,y:63.636363636364,fragment: false},{x:355.263,y:9.0909090909091,fragment: false},{x:333.279,y:0,fragment: false},{x:331.3,y:18.181818181818,fragment: false},{x:331.263,y:9.0909090909091,fragment: false},{x:329.284,y:36.363636363636,fragment: false},{x:327.268,y:9.0909090909091,fragment: false},{x:317.284,y:9.0909090909091,fragment: false},{x:315.268,y:18.181818181818,fragment: false},{x:303.268,y:9.0909090909091,fragment: false},{x:301.253,y:18.181818181818,fragment: false},{x:275.237,y:9.0909090909091,fragment: false},{x:273.221,y:18.181818181818,fragment: false},{x:261.221,y:0,fragment: false},{x:111.08,y:9.0909090909091,fragment: false},{x:97.0648,y:9.0909090909091,fragment: false},{x:76.0393,y:45.454545454545,fragment: false},{x:59.0128,y:63.636363636364,fragment: false},{x:58.0287,y:81.818181818182,fragment: false},{x:55.0542,y:9.0909090909091,fragment: false},{x:44.9971,y:9.0909090909091,fragment: false},{x:41.0022,y:18.181818181818,fragment: false},{x:32.0495,y:9.0909090909091,fragment: false},{x:30.0338,y:36.363636363636,fragment: false},{x:28.0182,y:54.545454545455,fragment: false} ]}]
});
});
</script>
</head>
<body style="border:0;overflow:visible">
<!-- 3. Add the container -->
<div id="container" style="width: 720px; height: 460px; margin: 0 auto">
</div>
<!-- <table align = "center" style="border-width:0; cellpadding:5; table-layout:fixed; bordercolor:'#00FF00'"> -->
<table align="center" style="border-width:0; cellpadding:5; table-layout:fixed; bordercolor:'#00FF00'">
<tr>
<td style="border-style: solid; border-color:#FFF8C6"><img align="top" alt="attention" src="img/attn.png" title="how to use spectrum"/>
<font color="red" face="helvetica,arial" size="2">
<b>Please mouse over the spectrum to view the detail information of each peak<br/>
Use left mouse button to zoom in (click and drag) and zoom out (double-click)</b></font>
</td>
</tr>
</table>
<script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-1907670-5']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script>
</body>
</html>
我想要提取的数据:
series: [{name: ' (+) 10 V [M+H]+ ',data:[{x:450.321,y:84,fragment: false},{x:434.29,y:0,fragment: false},{x:432.311,y:8,fragment: false},{x:432.311,y:100,fragment: false},{x:416.28,y:0,fragment: false},{x:414.3,y:24,fragment: false},{x:406.332,y:0,fragment: false},{x:404.316,y:16,fragment: false},{x:390.3,y:0,fragment: false},{x:390.264,y:0,fragment: false},{x:388.321,y:0,fragment: false},{x:386.305,y:8,fragment: false},{x:375.289,y:24,fragment: false},{x:372.29,y:0,fragment: false},{x:357.279,y:12,fragment: false},{x:347.294,y:4,fragment: false},{x:331.3,y:0,fragment: false},{x:329.284,y:4,fragment: false},{x:319.263,y:0,fragment: false},{x:301.253,y:0,fragment: false},{x:291.232,y:0,fragment: false},{x:273.221,y:0,fragment: false},{x:240.159,y:0,fragment: false},{x:158.081,y:0,fragment: false},{x:130.05,y:0,fragment: false},{x:76.0393,y:44,fragment: false},{x:74.0237,y:0,fragment: false},{x:59.0128,y:8,fragment: false},{x:58.0287,y:0,fragment: false},{x:30.0338,y:0,fragment: false},{x:28.0182,y:0,fragment: false} ]},{name: ' (+) 20 V [M+H]+ ',data:[{x:450.321,y:11.764705882353,fragment: false},{x:434.29,y:0,fragment: false},{x:432.311,y:35.294117647059,fragment: false},{x:432.311,y:5.8823529411765,fragment: false},{x:416.28,y:5.8823529411765,fragment: false},{x:414.3,y:58.823529411765,fragment: false},{x:404.316,y:11.764705882353,fragment: false},{x:388.321,y:5.8823529411765,fragment: false},{x:386.305,y:35.294117647059,fragment: false},{x:375.289,y:29.411764705882,fragment: false},{x:372.29,y:5.8823529411765,fragment: false},{x:357.279,y:35.294117647059,fragment: false},{x:347.294,y:17.647058823529,fragment: false},{x:333.279,y:5.8823529411765,fragment: false},{x:331.3,y:5.8823529411765,fragment: false},{x:329.284,y:17.647058823529,fragment: false},{x:327.268,y:5.8823529411765,fragment: false},{x:319.263,y:0,fragment: false},{x:315.268,y:5.8823529411765,fragment: false},{x:301.253,y:5.8823529411765,fragment: false},{x:273.221,y:5.8823529411765,fragment: 
false},{x:158.081,y:5.8823529411765,fragment: false},{x:130.05,y:5.8823529411765,fragment: false},{x:111.08,y:5.8823529411765,fragment: false},{x:102.019,y:0,fragment: false},{x:76.0393,y:100,fragment: false},{x:74.0237,y:5.8823529411765,fragment: false},{x:59.0128,y:23.529411764706,fragment: false},{x:58.0287,y:17.647058823529,fragment: false},{x:30.0338,y:11.764705882353,fragment: false},{x:28.0182,y:11.764705882353,fragment: false} ]},{name: ' (+) 40 V [M+H]+ ',data:[{x:416.28,y:18.181818181818,fragment: false},{x:414.3,y:100,fragment: false},{x:388.321,y:18.181818181818,fragment: false},{x:386.305,y:54.545454545455,fragment: false},{x:372.29,y:9.0909090909091,fragment: false},{x:359.294,y:9.0909090909091,fragment: false},{x:357.279,y:63.636363636364,fragment: false},{x:355.263,y:9.0909090909091,fragment: false},{x:333.279,y:0,fragment: false},{x:331.3,y:18.181818181818,fragment: false},{x:331.263,y:9.0909090909091,fragment: false},{x:329.284,y:36.363636363636,fragment: false},{x:327.268,y:9.0909090909091,fragment: false},{x:317.284,y:9.0909090909091,fragment: false},{x:315.268,y:18.181818181818,fragment: false},{x:303.268,y:9.0909090909091,fragment: false},{x:301.253,y:18.181818181818,fragment: false},{x:275.237,y:9.0909090909091,fragment: false},{x:273.221,y:18.181818181818,fragment: false},{x:261.221,y:0,fragment: false},{x:111.08,y:9.0909090909091,fragment: false},{x:97.0648,y:9.0909090909091,fragment: false},{x:76.0393,y:45.454545454545,fragment: false},{x:59.0128,y:63.636363636364,fragment: false},{x:58.0287,y:81.818181818182,fragment: false},{x:55.0542,y:9.0909090909091,fragment: false},{x:44.9971,y:9.0909090909091,fragment: false},{x:41.0022,y:18.181818181818,fragment: false},{x:32.0495,y:9.0909090909091,fragment: false},{x:30.0338,y:36.363636363636,fragment: false},{x:28.0182,y:54.545454545455,fragment: false} ]}]
我无法定位到这段数据,也无法以可用的形式把它提取出来。我注意到它的结构类似 JSON,但我没办法把它从 soup 中取出并加以解析。
如果我不清楚我想要做什么,请告诉我。
下面是我的python脚本:
from bs4 import BeautifulSoup
import urllib
import urllib.request
import xlwt
import xlrd
import requests
import re
import json
# Open the input workbook and keep a handle on its first sheet (index 0).
# The driver loop further down reads column index 2 of this sheet and passes
# each value as the CAS search term — presumably CAS numbers; confirm against
# the spreadsheet.
CASNUMBERS =xlrd.open_workbook("./OUTPUTFILE.xls")
CASNUMBERS_sheet = CASNUMBERS.sheet_by_index(0)
# import sqlite3
# conn = sqlite3.connect('CurationParsedData.db')
# c = conn.cursor()
#
# ##Create Table
# c.execute('''CREATE TABLE CurationParsedData(Exp_website TEXT,InSilico_website TEXT)''')
#
#
# def add_website(exp,insil):
# c.execute("INSERT INTO CurationParsedData VALUES("+ exp ","+ insil")")
# MID = "154"
# NAMEID = "glucose"
# CASID = "492-62-6"
# KEGGID = "C00267"
def make_soup(url):
    """Download *url* and return it parsed as a ``BeautifulSoup`` document.

    The page is parsed with the built-in ``"html.parser"`` backend.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``BeautifulSoup`` object built from the response body.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # The context manager closes the HTTP response once parsing is done;
    # BeautifulSoup reads the whole body while it is being constructed.
    with urllib.request.urlopen(url) as thepage:
        soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata
def ESILINK(number: str) -> str:
    """Return the METLIN ``showChart.php`` URL with ``etype=experimental``.

    Args:
        number: Molecule id placed in the ``molid`` query parameter. Pass the
            raw (not already percent-encoded) value.

    Returns:
        The complete chart URL as a string.
    """
    from urllib.parse import quote

    # Percent-encode the id so whitespace or '&' in scraped text cannot break
    # the query string; plain ids such as "203" are left unchanged.
    molid = quote(number, safe="")
    return (
        "https://metlin.scripps.edu/showChart.php?molid="
        + molid
        + "&h=240&collE=&Imode=p&etype=experimental"
    )
def insilico(number: str) -> str:
    """Return the METLIN ``showChart.php`` URL with ``etype=insilico``.

    Args:
        number: Molecule id placed in the ``molid`` query parameter. Pass the
            raw (not already percent-encoded) value.

    Returns:
        The complete chart URL as a string.
    """
    from urllib.parse import quote

    # Percent-encode the id so whitespace or '&' in scraped text cannot break
    # the query string; plain ids such as "203" are left unchanged.
    molid = quote(number, safe="")
    return (
        "https://metlin.scripps.edu/showChart.php?molid="
        + molid
        + "&h=240&collE=&Imode=p&etype=insilico"
    )
def metlinsearch(NAME: str = "", CAS: str = "", KEGG: str = "") -> str:
    """Return a METLIN ``advanced_search_result.php`` URL for the given terms.

    Every argument is optional; an empty string leaves that query parameter
    blank.

    Args:
        NAME: Value for the ``name`` query parameter.
        CAS: Value for the ``cas`` query parameter.
        KEGG: Value for the ``kegg`` query parameter.

    Returns:
        The complete search URL as a string.
    """
    from urllib.parse import quote

    # Percent-encode each term: names in particular may contain spaces or
    # other characters that are not valid inside a URL. Values made only of
    # letters, digits and '-' (e.g. "492-62-6") pass through unchanged.
    return (
        "https://metlin.scripps.edu/advanced_search_result.php?molid=&mass_min=&mass_max=&name="
        + quote(NAME, safe="")
        + "&formula=&cas="
        + quote(CAS, safe="")
        + "&kegg="
        + quote(KEGG, safe="")
        + "&smilefile=&msmspeaks_min=&AminoAcid=add&drug=add&toxinEPA=add&smilesExactMatchCheckBox=false&nameExactMatchCheckBox=false"
    )
def HMDBsearch(number: str) -> str:
    """Return the HMDB ``unearth`` metabolite-search URL for *number*.

    Args:
        number: Search term placed in the ``query`` parameter. Pass the raw
            (not already percent-encoded) value.

    Returns:
        The complete search URL as a string.
    """
    from urllib.parse import quote

    # Percent-encode the term so spaces or '&' cannot break the query string.
    query = quote(number, safe="")
    return (
        "http://www.hmdb.ca/unearth/q?utf8=%E2%9C%93&query="
        + query
        + "&searcher=metabolites&button="
    )
# Driver script: for every value in column index 2 of the sheet (read from row
# index 1 onward), build the HMDB and METLIN search URLs, scrape the METLIN
# result page for <th scope="row"> cells, and try to read the chart data behind
# the first hit.
# NOTE(review): the leading whitespace of this script was lost when it was
# pasted, so the intended nesting of the statements below cannot be recovered
# from here; each comment describes its statement on its own.
for items in CASNUMBERS_sheet.col_values(2, 1):
print(items)
hmdbsearch_link = HMDBsearch(items)
print(hmdbsearch_link)
metlinsearch_link = metlinsearch(CAS = items)
metlinesearch_soup = make_soup(metlinsearch_link)
# First and all <th scope="row"> cells of the search-result page; their text is
# used below as the molecule id — presumably METLIN MIDs, confirm on the page.
firstMID = metlinesearch_soup.find("th", {"scope": "row"})
allMID = metlinesearch_soup.findAll("th", {"scope": "row"})
# Placeholders that remain when the search page has no matching cell.
ESI = "NO LINK"
INSILICO = "NO LINK"
if firstMID != None:
firstMID = firstMID.text
ESI = ESILINK(firstMID)
INSILICO = insilico(firstMID)
# NOTE(review): findAll presumably returns a (possibly empty) list-like result
# rather than None, which would make this test always true — confirm.
if allMID != None:
MIDlist = []
# NOTE(review): this loop reuses the name `items` of the outer loop variable.
for items in allMID:
MIDlist.append(items.text)
# NOTE(review): ESI / INSILICO already hold complete URLs (or "NO LINK") here,
# yet they are fed to the URL builders again; esi and sil are not read below.
esi = ESILINK(ESI)
sil = insilico(INSILICO)
print(ESI)
print(INSILICO)
# sil_soup = make_soup(sil)
# NOTE(review): INSILICO may still be "NO LINK" at this point; nothing guards
# the request against that.
sil_link = requests.get(INSILICO)
sil_soup = BeautifulSoup(sil_link.text, "lxml")
# print(sil_soup)
# Collect the <script type="text/javascript"> elements and keep the fourth one
# by position (index 3).
series = sil_soup.findAll('script', {"type": "text/javascript"})
series = series[3]
# Stringify every child node of that <script> element.
info = []
for x in series:
info.append(str(x))
for text in info:
# Keep only what follows the first 'series:' marker.
head, body, tail = text.partition('series:')
tail = tail.replace(' ', '').replace(';', '').replace(' ', '')
print(tail)
json_string = tail
# NOTE(review): in the page shown above, the text after 'series:' is a
# JavaScript object literal (unquoted keys such as name/data, single-quoted
# strings) followed by the rest of the script, which json.loads does not accept
# as JSON — presumably the cause of the failure described in the question.
parse_json= json.loads(json_string)
# NOTE(review): the series value in the page is an array of objects, so
# indexing the parsed result with 'data' looks wrong even if parsing
# succeeded — confirm.
print(parse_json['data'])
答案 0 :(得分:0)
我通过几个步骤得到了想要的输出,这绝对不是最好、最优的方式:先把脚本内容转换为字符串 -> 用 partition 截取需要的部分 -> 删除字符串中不需要的字符 -> 用 for 循环把数据重新整理成列表 -> 再遍历一遍,用 partition 分离出 x 和 y 值 -> 转换为浮点数 -> 得到的值就可以用于绘图了。
这种完成任务的方式非常迂回,但也许能帮助在解析 JavaScript 数据时遇到类似问题的人。
下面是代码:
from bs4 import BeautifulSoup
import urllib
import urllib.request
import xlwt
import xlrd
import requests
import re
import json
import matplotlib.pyplot as plt
# Open the input workbook and keep a handle on its first sheet (index 0).
# The driver loop further down reads column index 2 of this sheet and passes
# each value as the CAS search term — presumably CAS numbers; confirm against
# the spreadsheet.
CASNUMBERS =xlrd.open_workbook("./OUTPUTFILE.xls")
CASNUMBERS_sheet = CASNUMBERS.sheet_by_index(0)
# import sqlite3
# conn = sqlite3.connect('CurationParsedData.db')
# c = conn.cursor()
#
# ##Create Table
# c.execute('''CREATE TABLE CurationParsedData(Exp_website TEXT,InSilico_website TEXT)''')
#
#
# def add_website(exp,insil):
# c.execute("INSERT INTO CurationParsedData VALUES("+ exp ","+ insil")")
def make_soup(url):
    """Download *url* and return it parsed as a ``BeautifulSoup`` document.

    The page is parsed with the built-in ``"html.parser"`` backend.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``BeautifulSoup`` object built from the response body.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # The context manager closes the HTTP response once parsing is done;
    # BeautifulSoup reads the whole body while it is being constructed.
    with urllib.request.urlopen(url) as thepage:
        soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata
def ESILINK(number: str) -> str:
    """Return the METLIN ``showChart.php`` URL with ``etype=experimental``.

    Args:
        number: Molecule id placed in the ``molid`` query parameter. Pass the
            raw (not already percent-encoded) value.

    Returns:
        The complete chart URL as a string.
    """
    from urllib.parse import quote

    # Percent-encode the id so whitespace or '&' in scraped text cannot break
    # the query string; plain ids such as "203" are left unchanged.
    molid = quote(number, safe="")
    return (
        "https://metlin.scripps.edu/showChart.php?molid="
        + molid
        + "&h=240&collE=&Imode=p&etype=experimental"
    )
def insilico(number: str) -> str:
    """Return the METLIN ``showChart.php`` URL with ``etype=insilico``.

    Args:
        number: Molecule id placed in the ``molid`` query parameter. Pass the
            raw (not already percent-encoded) value.

    Returns:
        The complete chart URL as a string.
    """
    from urllib.parse import quote

    # Percent-encode the id so whitespace or '&' in scraped text cannot break
    # the query string; plain ids such as "203" are left unchanged.
    molid = quote(number, safe="")
    return (
        "https://metlin.scripps.edu/showChart.php?molid="
        + molid
        + "&h=240&collE=&Imode=p&etype=insilico"
    )
def metlinsearch(NAME: str = "", CAS: str = "", KEGG: str = "") -> str:
    """Return a METLIN ``advanced_search_result.php`` URL for the given terms.

    Every argument is optional; an empty string leaves that query parameter
    blank.

    Args:
        NAME: Value for the ``name`` query parameter.
        CAS: Value for the ``cas`` query parameter.
        KEGG: Value for the ``kegg`` query parameter.

    Returns:
        The complete search URL as a string.
    """
    from urllib.parse import quote

    # Percent-encode each term: names in particular may contain spaces or
    # other characters that are not valid inside a URL. Values made only of
    # letters, digits and '-' (e.g. "492-62-6") pass through unchanged.
    return (
        "https://metlin.scripps.edu/advanced_search_result.php?molid=&mass_min=&mass_max=&name="
        + quote(NAME, safe="")
        + "&formula=&cas="
        + quote(CAS, safe="")
        + "&kegg="
        + quote(KEGG, safe="")
        + "&smilefile=&msmspeaks_min=&AminoAcid=add&drug=add&toxinEPA=add&smilesExactMatchCheckBox=false&nameExactMatchCheckBox=false"
    )
def HMDBsearch(number: str) -> str:
    """Return the HMDB ``unearth`` metabolite-search URL for *number*.

    Args:
        number: Search term placed in the ``query`` parameter. Pass the raw
            (not already percent-encoded) value.

    Returns:
        The complete search URL as a string.
    """
    from urllib.parse import quote

    # Percent-encode the term so spaces or '&' cannot break the query string.
    query = quote(number, safe="")
    return (
        "http://www.hmdb.ca/unearth/q?utf8=%E2%9C%93&query="
        + query
        + "&searcher=metabolites&button="
    )
def _series_tokens(script_text):
    """Split the chart script text after ``series:`` into cleaned tokens.

    The text is stripped of whitespace, ';', the ``fragment:`` flags, square
    brackets and the ``name:`` / ``data:`` keys, then cut at every brace.
    Commas and parentheses are removed from each piece and empty pieces are
    dropped. Because of that stripping, a series name such as
    ``' (+) 10 V [M+H]+ '`` comes back as ``'+10VM+H+'`` (quotes included).

    Args:
        script_text: Source text of the inline chart ``<script>``.

    Returns:
        A list of strings: series-name tokens and ``x:<value>y:<value>``
        point tokens, in page order.
    """
    _head, _marker, tail = script_text.partition('series:')
    # Remove ALL whitespace before looking for the fragment flags, so a line
    # break inside "fragment: false" cannot leave junk attached to a y value.
    tail = re.sub(r"\s+", "", tail).replace(';', '')
    for noise in ('fragment:false', 'fragment:true', '[', ']', 'name:', 'data:'):
        tail = tail.replace(noise, '')
    # Only text that is followed by a brace forms a token; whatever trails the
    # last brace is discarded.
    pieces = re.split(r"[{}]", tail)[:-1]
    unwanted = str.maketrans('', '', ',()')
    cleaned = (piece.translate(unwanted) for piece in pieces)
    return [token for token in cleaned if token]


def _parse_point(token):
    """Turn an ``x:<m/z>y:<intensity>`` token into a rounded ``(x, y)`` pair."""
    _before, _marker, rest = token.partition("x:")
    x_text, _marker, y_text = rest.partition("y:")
    return round(float(x_text), 3), round(float(y_text), 3)


# Driver: for every value in column index 2 of the sheet (row index 1 onward),
# look the compound up on HMDB / METLIN and print the spectrum points of the
# first METLIN hit, grouped under their series ("energy") name.
for cas_number in CASNUMBERS_sheet.col_values(2, 1):
    print(cas_number)
    hmdbsearch_link = HMDBsearch(cas_number)
    print(hmdbsearch_link)
    metlinsearch_link = metlinsearch(CAS=cas_number)
    metlinesearch_soup = make_soup(metlinsearch_link)

    # Text of every <th scope="row"> cell on the result page; the first one is
    # used as the molecule id for the chart links.
    allMID = metlinesearch_soup.findAll("th", {"scope": "row"})
    MIDlist = [cell.text for cell in allMID]

    ESI = "NO LINK"
    INSILICO = "NO LINK"
    if MIDlist:
        firstMID = MIDlist[0]
        ESI = ESILINK(firstMID)
        INSILICO = insilico(firstMID)
    print(ESI)
    print(INSILICO)
    if ESI == "NO LINK":
        continue

    sil_link = requests.get(ESI)
    sil_soup = BeautifulSoup(sil_link.text, "lxml")
    # Locate the chart script by its content instead of by position, so the
    # lookup does not depend on how many <script> tags precede it.
    for script in sil_soup.findAll('script', {"type": "text/javascript"}):
        script_text = "".join(str(child) for child in script.children)
        if 'series:' not in script_text:
            continue
        finallist = _series_tokens(script_text)
        print(finallist)
        for token in finallist:
            if token.startswith("x:"):
                xvalue, yvalue = _parse_point(token)
                print("x:", xvalue, "y:", yvalue)
            else:
                # Anything that is not a point is a series name.
                print("energy", token)