如何在puppeteer中下载,访问和处理复杂的txt文件?
我可以这样访问xml文件(Node.js puppeteer - Downloading/Accessing a xml file and process the content):
// Navigate to the target document and wait for the page's `load` event.
await page.goto(myPage, {waitUntil: 'load'});
// Runs inside the browser page: collects the text of every <VALUEA> element
// and hands it back to Node as a JSON string.
const newPage = await page.evaluate(() => {
var columns = document.getElementsByTagName("VALUEA");
var values = {"values":[]};
// NOTE(review): `for...in` walks every enumerable key of the collection, not
// only the element indices (in Chrome presumably also `length`, `item` and
// `namedItem` - confirm). For such keys `columns[f].innerText` is undefined,
// and JSON.stringify serializes undefined array entries as null.
for(let f in columns){
values.values.push(columns[f].innerText);
}
return JSON.stringify(values);
});
// Parse the JSON string back into an object on the Node side and print it.
console.log(JSON.parse(newPage))
这会返回我想要的值。但是,如果我访问如下所示的 txt 文件,并用同样的方法尝试获取所有 “VALUEA”,得到的却是:
{"values":[null,null,null]}
这就是我正在查看的TXT文件:
<XYZ-DOCUMENT>117.txt : 20180824
<XYZ-HEADER>117.hdr.sgml : 20180824
<VALUE00>20180824153107
VALUE01: 117
VALUE02: ABC
COMPANY:
COMPANY DATA:
VALUE03: Some Company
VALUE04: 777
BUSINESS ADDRESS:
VALUE05: Some street
VALUE06: Some city
</XYZ-HEADER>
<DOCUMENT>
<VALUE07>ABC
<SEQUENCE>1
<FILENAME>primary_doc.xml
<TEXT>
<XML>
<?xml version="1.0" encoding="UTF-8"?>
<Submission xmlns="http://www.xyz.it/abc/" xmlns:com="http://www.xyz.it/abc/common">
<headerData>
<VALUE08>ABC</VALUE08>
<xxxInfo>
<xxx>
<credentials>
<VALUE09>777</VALUE09>
<VALUE10>XXXXXXXX</VALUE10>
</credentials>
</xxx>
<VALUE11>06-30-2018</VALUE11>
</xxxInfo>
</headerData>
<bodyData>
<coverPage>
<VALUE12>06-30-2018</VALUE12>
<VALUE13>1</VALUE13>
<amendmentInfo>
<VALUE14>STRAWBERRIES</VALUE14>
</amendmentInfo>
<xxxManager>
<VALUE15>Corp</VALUE15>
<address>
<VALUE16:street1>MOUNTAIN STREET</VALUE16:street1>
<VALUE17:city>NEW YORK</VALUE17:city>
</address>
</xxxManager>
<provideInfoForInstruction5>N</provideInfoForInstruction5>
</coverPage>
</bodyData>
</Submission>
</XML>
</TEXT>
</DOCUMENT>
<DOCUMENT>
<TYPE>INFORMATION TABLE
<SEQUENCE>2
<FILENAME>xml_xyz.xml
<TEXT>
<XML>
<?xml version="1.0" encoding="UTF-8"?>
<informationTable xmlns="http://www.xyz.it/abc/informationtable" xmlns:xsi=" http://www.w3.org/2001/XMLSchema-instance">
<infoTable>
<VALUEA>Company A</VALUEA>
<VALUEB>INC</VALUEB>
<shParent>
<VALUEC>123</VALUEC>
<VALUED>AB</VALUED>
</shParent>
</infoTable>
<infoTable>
<VALUEA>Company B</VALUEA>
<VALUEB>LTD</VALUEB>
<shParent>
<VALUEC>567</VALUEC>
<VALUED>ST</VALUED>
</shParent>
</infoTable>
...
</informationTable>
</XML>
</TEXT>
</DOCUMENT>
</XYZ-DOCUMENT>
最后,我要获取所有VALUE(主要是VALUEA,VALUEB,VALUEC,VALUED)!怎么做?
答案 0 :(得分:1)
要把 txt 文件中的 html / xml 内容作为 DOM 元素来读取,一个可行的解决方案是创建一个虚拟 DOM 节点,如下所示:
// Create a detached <html> element to act as a scratch parsing container.
const el = document.createElement('html');
// Assigning markup to innerHTML makes the browser parse the string into child nodes.
el.innerHTML = '<tagelement>content</tagelement>';
// The parsed nodes can now be queried like any other DOM subtree.
const columns = el.getElementsByTagName('tagelement');
在您的示例中:
const puppeteer = require('puppeteer');

/**
 * Opens the text file in a browser page, re-parses its textual content as
 * HTML inside a detached element, and prints the text of every VALUEA,
 * VALUEB, VALUEC and VALUED element grouped by tag name.
 */
(async () => {
  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage();
    await page.goto('http://example.com/file.txt', { waitUntil: 'load' });

    // Everything inside this callback runs in the browser page, not in Node.
    const newPage = await page.evaluate(() => {
      // The raw file text is read from the first <pre> element of the page
      // (assumes the browser displays a plain-text response inside a <pre>).
      const el = document.createElement('html');
      el.innerHTML = document.getElementsByTagName('pre')[0].innerText;

      const allValues = el.querySelectorAll('VALUEA, VALUEB, VALUEC, VALUED');

      // Group the element texts by tag name, e.g. { VALUEA: [...], VALUEB: [...] }.
      const values = {};
      for (const node of allValues) {
        const tagName = node.nodeName;
        if (!(tagName in values)) {
          values[tagName] = [];
        }
        values[tagName].push(node.innerText);
      }
      return JSON.stringify(values);
    });

    console.log(JSON.parse(newPage));
  } finally {
    // Always release the browser; otherwise the Node process never exits.
    await browser.close();
  }
})().catch((err) => {
  // Surface failures (navigation errors, missing <pre>, ...) instead of
  // leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});
答案 1 :(得分:1)
您可以使用以下解决方案从 VALUEA、VALUEB、VALUEC 和 VALUED 中获取文本内容:
// Runs in the browser page: re-parses the page's text as HTML inside a
// detached <html> element and returns the text of each wanted tag.
const example = await page.evaluate(() => {
  const container = document.createElement('html');
  container.innerHTML = document.body.textContent;

  // Text content of every element with the given tag name, in document order.
  const textsOf = (tagName) =>
    Array.from(container.getElementsByTagName(tagName), (node) => node.textContent);

  return {
    VALUEA: textsOf('VALUEA'),
    VALUEB: textsOf('VALUEB'),
    VALUEC: textsOf('VALUEC'),
    VALUED: textsOf('VALUED'),
  };
});

// Expected output for the sample file shown in the question:
console.log(example.VALUEA[0]); // Company A
console.log(example.VALUEA[1]); // Company B
console.log(example.VALUEB[0]); // INC
console.log(example.VALUEB[1]); // LTD
console.log(example.VALUEC[0]); // 123
console.log(example.VALUEC[1]); // 567
console.log(example.VALUED[0]); // AB
console.log(example.VALUED[1]); // ST