Node.js fs cheerio读写多个文件

时间:2016-07-27 22:14:54

标签: javascript node.js fs cheerio

我有以下代码(改编自 here 链接中的示例),使用 Node.js 和 Cheerio 读取 HTML 文件,并将大型源文件拆分成小块。该代码适用于单个文件。

现在我需要读取多个大型 HTML 文件,将它们逐个拆分,并把生成的文件输出到一个文件夹中。如何读取文件夹中的每个文件,将其拆分,然后写入结果文件?

以下是代码:

var cheerio = require('cheerio'),
    fs = require('fs');

// Read the single source file as UTF-8 text; dataLoaded is invoked with
// (err, data) once the read completes.
fs.readFile('./sourceHtml2/testone.html', 'utf8', dataLoaded);

/**
 * Callback for fs.readFile: splits the loaded HTML into one file per
 * direct <div> child of #toplevel and writes each piece to ./output2/,
 * named after the div's id attribute.
 *
 * @param {Error|null} err - read error passed by fs.readFile, if any
 * @param {string} data - HTML source text of the file that was read
 * @throws {Error} rethrows `err` when the source file could not be read
 */
function dataLoaded(err, data) {

  // Without the source text there is nothing to split; fail loudly instead
  // of handing undefined data to cheerio.
  if (err) {
    throw err;
  }

  // Keep the cheerio handle local instead of leaking an implicit global `$`.
  var $ = cheerio.load(data);

  $('#toplevel > div').each(function (i, elem) {

    var id = $(elem).attr('id'),
        filename = id + '.html',
        content = $.html(elem);

    fs.writeFile('./output2/' + filename, content, function (writeErr) {

        if (writeErr) {
            // Report the failure instead of claiming the file was written.
            console.error('Failed to write ' + filename + ': ' + writeErr.message);
            return;
        }
        console.log('Written html to ' + filename);
    });
  });
}

这是我的示例源文件

<!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <title>Lorem Ipsum</title>
  </head>
  <body>
    <div id="toplevel">
      <div id="1-1">
        <h1>HTML Ipsum Presents One</h1>
        <p>
        <strong>Pellentesque habitant morbi tristique</strong>senectus et netus et malesuada fames ac turpis egestas. Vestibulum tortor quam, feugiat vitae, ultricies eget, tempor sit amet, ante. Donec eu libero sit amet quam egestas semper. 

        <h2>Header Level 2</h2>
        <ol>
          <li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
          <li>Aliquam tincidunt mauris eu risus.</li>
        </ol>
        <h3>Header Level 3</h3>
        <ul>
          <li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
          <li>Aliquam tincidunt mauris eu risus.</li>
        </ul>
      </div>
      <div id="1-2">
        <h1>HTML Ipsum Presents Two</h1>
        <p>
        <strong>Pellentesque habitant morbi tristique</strong>senectus et netus et malesuada fames ac turpis egestas. Vestibulum tortor quam, feugiat vitae, ultricies eget, tempor sit amet, ante. Donec eu libero sit amet quam egestas semper. 

        <h2>Header Level 2</h2>
        <ol>
          <li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
          <li>Aliquam tincidunt mauris eu risus.</li>
        </ol>
        <blockquote>
          <p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus magna. Cras in mi at felis aliquet congue. Ut a est eget ligula molestie gravida. Curabitur massa. Donec eleifend, libero at sagittis mollis, tellus est malesuada tellus,
          at luctus turpis elit sit amet quam. Vivamus pretium ornare est.</p>
        </blockquote>
        <h3>Header Level 3</h3>
        <ul>
          <li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
          <li>Aliquam tincidunt mauris eu risus.</li>
        </ul>
      </div>
      <div id="1-3">
        <h1>HTML Ipsum Presents Three</h1>
        <p>
        <strong>Pellentesque habitant morbi tristique</strong>senectus et netus et malesuada fames ac turpis egestas. Vestibulum tortor quam, feugiat vitae, ultricies eget, tempor sit amet, ante. Donec eu libero sit amet quam egestas semper. 

        <h2>Header Level 2</h2>
        <ol>
          <li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
          <li>Aliquam tincidunt mauris eu risus.</li>
        </ol>
        <blockquote>
          <p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus magna. Cras in mi at felis aliquet congue. Ut a est eget ligula molestie gravida. Curabitur massa. Donec eleifend, libero at sagittis mollis, tellus est malesuada tellus,
          at luctus turpis elit sit amet quam. Vivamus pretium ornare est.</p>
        </blockquote>
        <h3>Header Level 3</h3>
        <ul>
          <li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li>
          <li>Aliquam tincidunt mauris eu risus.</li>
        </ul>
      </div>
    </div>
  </body>
</html>

非常感谢您的帮助。

1 个答案:

答案 0 :(得分:1)

您需要将输入目录中的文件作为数组进行处理,并且还要防止输出文件夹中的文件名冲突。

下面提供的代码为这两个问题提供了解决方案。 HTML文件(.htm和.html)从'input'子文件夹中读取,生成的文件写入'output'子文件夹。

var cheerio = require('cheerio'),
    fs = require('fs');

// process files found in the 'input' folder: list its entries and pass
// them to findHtmlFiles as (err, files)
fs.readdir('./input', 'utf8', findHtmlFiles);

/**
 * Callback for fs.readdir: reads every '.htm' / '.html' file found in the
 * 'input' folder and hands its contents to dataLoaded together with the
 * file's base name (the name without its extension).
 *
 * @param {Error|null} err - directory listing error from fs.readdir, if any
 * @param {string[]} files - names of the entries in the 'input' folder
 * @throws {Error} rethrows `err` when the directory could not be listed
 */
function findHtmlFiles(err, files) {

    // `files` is undefined when readdir failed, so surface the real error
    // rather than a TypeError on `files.length`.
    if (err) {
        throw err;
    }

    var pattern = /\.[0-9a-z]{1,5}$/i;

    files.forEach(function (fullFilename) {
        var match = fullFilename.match(pattern);
        // entries without an extension (e.g. 'README') do not match at all
        if (!match) {
            return;
        }

        var ext = match[0];
        var extLower = ext.toLowerCase();
        // only process '.htm' and '.html' files; the pattern is
        // case-insensitive, so the comparison is as well
        if (extLower !== '.htm' && extLower !== '.html') {
            return;
        }

        fs.readFile('./input/' + fullFilename, 'utf8', function (readErr, data) {
            if (readErr) {
                throw readErr;
            }
            // add the file name to prevent collisions
            // in the output folder
            var fileData = {
                file: fullFilename.slice(0, -ext.length),
                data: data
            };
            dataLoaded(null, fileData);
        });
    });

}

/**
 * Splits one loaded HTML document into one file per direct <div> child of
 * #toplevel and writes each piece to ./output/. Output names are prefixed
 * with the source file's base name so pieces from different inputs do not
 * overwrite each other.
 *
 * @param {Error|null} err - error from the caller, if any
 * @param {{file: string, data: string}} fd - source base name and HTML text
 * @throws {Error} rethrows `err` when one is supplied
 */
function dataLoaded(err, fd) {

    // Nothing can be split if the caller reports a failure.
    if (err) {
        throw err;
    }

    // Keep the cheerio handle local; an implicit global `$` would be shared
    // (and overwritten) across the files being processed concurrently.
    var $ = cheerio.load(fd.data);

    $('#toplevel > div').each(function (i, elem) {

        var id = $(elem).attr('id'),
            filename = fd.file + '_' + id + '.html',
            content = $.html(elem);

        fs.writeFile('./output/' + filename, content, function (writeErr) {

            if (writeErr) {
                // Report the failure instead of claiming the file was written.
                console.error('Failed to write ' + filename + ': ' + writeErr.message);
                return;
            }
            console.log('Written html to ' + filename);
        });
    });
}

示例控制台输出:

Written html to testone_1-1.html
Written html to testone_1-2.html
Written html to testone_1-3.html
Written html to testtwo_1-1.html
Written html to testtwo_1-2.html
Written html to testtwo_1-3.html