我有以下代码(改编自这里)。我正在使用 Node.js 和 Cheerio 读取 HTML 文件,并将大型源文件拆分成小块。该代码适用于单个文件。
现在我需要读取多个大型 HTML 文件,将它们逐个拆分,并把结果文件输出到一个文件夹中。如何读取文件夹中的每个文件、将其拆分并写出结果?
以下是代码:
var cheerio = require('cheerio'),
fs = require('fs');
// Entry point: load the single source document as UTF-8 text and hand the
// result (or the read error) to dataLoaded.
fs.readFile(
    './sourceHtml2/testone.html',
    { encoding: 'utf8' },
    dataLoaded
);
/**
 * Callback for fs.readFile: splits the loaded HTML document into one file
 * per direct <div> child of #toplevel and writes each piece to ./output2/,
 * named after the div's id attribute.
 *
 * @param {?Error} err - Read error reported by fs.readFile, if any.
 * @param {string} data - HTML source of the document to split.
 */
function dataLoaded(err, data) {
    if (err) {
        // Without this guard a failed read would pass `undefined` to cheerio.
        console.error('Failed to read source html: ' + err.message);
        return;
    }

    // Declared locally: a bare `$ = ...` creates an implicit global, which is
    // a ReferenceError in strict mode / ES modules.
    var $ = cheerio.load(data);

    $('#toplevel > div').each(function (i, elem) {
        var id = $(elem).attr('id');
        if (!id) {
            // An id-less section would otherwise be written to
            // "undefined.html" and overwrite any other id-less section.
            console.warn('Skipping #toplevel child ' + i + ': no id attribute');
            return;
        }

        var filename = id + '.html';
        var content = $.html(elem);

        fs.writeFile('./output2/' + filename, content, function (writeErr) {
            if (writeErr) {
                // Only report success when the write actually succeeded.
                console.error('Failed to write ' + filename + ': ' + writeErr.message);
                return;
            }
            console.log('Written html to ' + filename);
        });
    });
}
这是我的示例源文件
<!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Lorem Ipsum</title>
</head>
<body>
<div id="toplevel">
<div id="1-1">
<h1>HTML Ipsum Presents One</h1>
<p>
<strong>Pellentesque habitant morbi tristique</strong>senectus et netus et malesuada fames ac turpis egestas. Vestibulum tortor quam, feugiat vitae, ultricies eget, tempor sit amet, ante. Donec eu libero sit amet quam egestas semper.
<h2>Header Level 2</h2>
<ol>
<li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
<li>Aliquam tincidunt mauris eu risus.</li>
</ol>
<h3>Header Level 3</h3>
<ul>
<li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
<li>Aliquam tincidunt mauris eu risus.</li>
</ul>
</div>
<div id="1-2">
<h1>HTML Ipsum Presents Two</h1>
<p>
<strong>Pellentesque habitant morbi tristique</strong>senectus et netus et malesuada fames ac turpis egestas. Vestibulum tortor quam, feugiat vitae, ultricies eget, tempor sit amet, ante. Donec eu libero sit amet quam egestas semper.
<h2>Header Level 2</h2>
<ol>
<li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
<li>Aliquam tincidunt mauris eu risus.</li>
</ol>
<blockquote>
<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus magna. Cras in mi at felis aliquet congue. Ut a est eget ligula molestie gravida. Curabitur massa. Donec eleifend, libero at sagittis mollis, tellus est malesuada tellus,
at luctus turpis elit sit amet quam. Vivamus pretium ornare est.</p>
</blockquote>
<h3>Header Level 3</h3>
<ul>
<li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
<li>Aliquam tincidunt mauris eu risus.</li>
</ul>
</div>
<div id="1-3">
<h1>HTML Ipsum Presents Three</h1>
<p>
<strong>Pellentesque habitant morbi tristique</strong>senectus et netus et malesuada fames ac turpis egestas. Vestibulum tortor quam, feugiat vitae, ultricies eget, tempor sit amet, ante. Donec eu libero sit amet quam egestas semper.
<h2>Header Level 2</h2>
<ol>
<li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
<li>Aliquam tincidunt mauris eu risus.</li>
</ol>
<blockquote>
<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus magna. Cras in mi at felis aliquet congue. Ut a est eget ligula molestie gravida. Curabitur massa. Donec eleifend, libero at sagittis mollis, tellus est malesuada tellus,
at luctus turpis elit sit amet quam. Vivamus pretium ornare est.</p>
</blockquote>
<h3>Header Level 3</h3>
<ul>
<li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
<li>Aliquam tincidunt mauris eu risus.</li>
</ul>
</div>
</div>
</body>
</html>
非常感谢您的帮助。
答案 0 :(得分:1)
您需要将输入目录中的文件作为数组进行处理,并且还要防止输出文件夹中的文件名冲突。
下面提供的代码为这两个问题提供了解决方案。 HTML文件(.htm和.html)从'input'子文件夹中读取,生成的文件写入'output'子文件夹。
var cheerio = require('cheerio'),
fs = require('fs');
// process files found in the 'input' folder
// Entry point: list the 'input' folder (names decoded as UTF-8) and pass the
// listing, or the error, to findHtmlFiles.
fs.readdir(
    './input',
    { encoding: 'utf8' },
    findHtmlFiles
);
/**
 * Callback for fs.readdir: reads every '.htm' / '.html' file found in the
 * 'input' folder and passes its contents to dataLoaded together with the
 * file's base name (extension stripped), which dataLoaded uses to keep
 * output file names from colliding.
 *
 * @param {?Error} err - Error reported by fs.readdir, if any.
 * @param {string[]} files - Names of the entries in the 'input' folder.
 * @throws {Error} Rethrows the readdir error, or a readFile error from
 *     inside the read callback.
 */
function findHtmlFiles(err, files) {
    if (err) {
        // `files` is undefined when the listing failed; surface the real
        // error instead of a TypeError on `files.length`.
        throw err;
    }

    // Trailing extension: a dot followed by 1-5 alphanumerics.
    var pattern = /\.[0-9a-z]{1,5}$/i;

    files.forEach(function (fullFilename) {
        var ext = fullFilename.match(pattern);
        if (!ext) {
            // Entries without an extension (e.g. "README") do not match.
            return;
        }

        // The pattern is case-insensitive, so compare case-insensitively too;
        // otherwise ".HTML" / ".HTM" files would be skipped.
        var suffix = ext[0].toLowerCase();
        if (suffix !== '.htm' && suffix !== '.html') {
            return;
        }

        fs.readFile('./input/' + fullFilename, 'utf8', function (readErr, data) {
            if (readErr) {
                throw readErr;
            }
            // Pass the base file name along to prevent collisions in the
            // output folder.
            var fileData = {
                file: fullFilename.slice(0, -ext[0].length),
                data: data
            };
            dataLoaded(null, fileData);
        });
    });
}
/**
 * Splits one loaded HTML document into one file per direct <div> child of
 * #toplevel and writes each piece to ./output/ as "<file>_<id>.html".
 *
 * @param {?Error} err - Error from loading the source file, if any.
 * @param {{file: string, data: string}} fd - Base name of the source file
 *     (used as the output file name prefix) and its HTML contents.
 */
function dataLoaded(err, fd) {
    if (err) {
        // Nothing to split when the source could not be loaded.
        console.error('Failed to load source html: ' + err.message);
        return;
    }

    // Declared locally: a bare `$ = ...` creates an implicit global, which is
    // a ReferenceError in strict mode / ES modules.
    var $ = cheerio.load(fd.data);

    $('#toplevel > div').each(function (i, elem) {
        var id = $(elem).attr('id');
        if (!id) {
            // An id-less section would otherwise be written to
            // "<file>_undefined.html" and overwrite any other id-less section.
            console.warn('Skipping #toplevel child ' + i + ' of ' + fd.file + ': no id attribute');
            return;
        }

        var filename = fd.file + '_' + id + '.html';
        var content = $.html(elem);

        fs.writeFile('./output/' + filename, content, function (writeErr) {
            if (writeErr) {
                // Only report success when the write actually succeeded.
                console.error('Failed to write ' + filename + ': ' + writeErr.message);
                return;
            }
            console.log('Written html to ' + filename);
        });
    });
}
示例控制台输出:
Written html to testone_1-1.html
Written html to testone_1-2.html
Written html to testone_1-3.html
Written html to testtwo_1-1.html
Written html to testtwo_1-2.html
Written html to testtwo_1-3.html