我正在制作一个工具,用来分析单词并尝试识别它们最常被使用的情况。我使用谷歌的 Ngram 数据集来实现这一目标。在我的代码中,我以流(stream)的方式读取这些数据(大约 2 GB),并把流数据转换为数组,每行数据作为一个条目。我想要做的是在数据中搜索某个单词,并将包含该单词的所有数组条目存储在一个变量中。目前我可以判断该单词是否在数据集中,并将该单词(或其在数据集中的位置)打印到控制台。我还在学习编程,所以如果我的代码很乱,请体谅这一点。
// imports fs (filesystem) package duh
const fs = require('fs');
// Read stream over the test data file; chunks arrive through the 'data' event.
const stream = fs.createReadStream("/Users/user/Desktop/authortest_nodejs/testdata/testdata - p");
// Becomes true once any chunk has contained the search term, and stays true.
let found = false;
// The term to look for in the data.
const search = "proceeded";
// Pattern source: a word boundary followed by the search term.
const searchThing = `\\b${search}`;
// Global flag so that match() returns every occurrence inside a chunk.
const searchRegExp = new RegExp(searchThing, "g");

// Runs once for every chunk read from the file.
stream.on('data', (chunk) => {
  // Decode the raw chunk once and reuse the text for every lookup below.
  const text = chunk.toString();
  // All occurrences of the term in this chunk, or null when there are none.
  const hits = text.match(searchRegExp);
  // Index of the first occurrence in this chunk (-1 when absent).
  const firstHitIndex = text.search(searchRegExp);

  // Only update the flag while nothing has been found in earlier chunks.
  if (!found) {
    found = Boolean(hits);
  }
  // match() yields null when nothing matched; skip to the next chunk.
  if (hits === null) {
    return;
  }

  // Comma-joined list of the matched terms, with the first "null" blanked out.
  const dataDisplay = hits.toString().replace("null", " ");
  // The next two values are prepared but not printed anywhere yet:
  // the first-hit index as text with line breaks stripped ...
  const dataLocDisplay = String(firstHitIndex).replace(/(\r\n|\n|\r)/gm,"");
  // ... and the chunk split into one array entry per line.
  const dataArray = text.split("\n");

  // Prints the matched terms only, not the lines that contain them.
  console.log(dataDisplay);
});

// Report a stream failure together with the current search state.
stream.on('error', (err) => {
  console.log(err, found);
});

// Final report once the stream has closed.
stream.on('close', (err) => {
  console.log(err, found, searchRegExp);
});
目前这段代码会输出数据中搜索词的每一个匹配(基本上是同一个词重复一百次左右),但我需要输出的是包含搜索词的每一整行,而不仅仅是搜索词本身(例如“Proceeded 2006 5 3”,而不仅仅是“proceeded”)。
答案 0 :(得分:0)
根据我的理解,你正在寻找这样的东西:
const fs = require('fs');
/**
 * Collects every line of a text file that contains `word` as a whole word.
 *
 * The file is read as a UTF-8 stream, so it is never held in memory in full;
 * only the matching lines are kept. Matching is case-insensitive and the word
 * is anchored with `\b` on both sides.
 *
 * @param {string} path - Path of the file to scan.
 * @param {string} word - Text to look for. It is matched literally: regex
 *   metacharacters in it are escaped before the pattern is built.
 * @returns {Promise<string[]>} Resolves with the matching lines in file order,
 *   without their line terminators. Rejects with the stream's error when the
 *   file cannot be opened or read.
 */
function grep(path, word) {
  return new Promise((resolve, reject) => {
    const stream = fs.createReadStream(path, { encoding: 'utf8' });
    // Escape metacharacters so that e.g. "a.b" does not also match "axb"
    // and "(" does not make the RegExp constructor throw.
    const escaped = String(word).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // No `g` flag: test() stays stateless across calls.
    const search = new RegExp(`\\b${escaped}\\b`, 'i');
    const out = [];
    // Tail of the previous chunk: a line whose end has not been seen yet.
    let buf = '';

    // Keeps a line if it matches. Named `collect` rather than `process` so
    // that Node's global `process` is not shadowed inside this function.
    function collect(rawLine) {
      // Drop the "\r" left over from CRLF line endings. Doing it per line
      // also covers a "\r\n" pair that was split across two chunks.
      const line = rawLine.endsWith('\r') ? rawLine.slice(0, -1) : rawLine;
      if (search.test(line)) {
        out.push(line);
      }
    }

    stream.on('data', (data) => {
      const lines = data.split('\n');
      // The first piece completes the line left unfinished by the last chunk.
      lines[0] = buf + lines[0];
      // The last piece may be incomplete; hold it back until more data arrives.
      buf = lines.pop();
      for (const line of lines) {
        collect(line);
      }
    });
    stream.on('end', () => {
      // Flush the final line, which has no trailing newline to terminate it.
      collect(buf);
      resolve(out);
    });
    // Without this listener a read failure would leave the promise pending.
    stream.on('error', reject);
  });
}
// Quick check: scan this very file for the word "stream" and print the
// matching lines. The catch keeps the promise chain from being left without
// a rejection handler.
grep(__filename, 'stream')
  .then((lines) => console.log(lines))
  .catch((err) => console.error(err));
我想这相当简单:需要 buf 来模拟逐行读取(您也可以使用 readline 或专用模块)。