使用Mailparser

时间:2017-11-24 13:50:57

标签: javascript node.js mongodb email-parsing

我对这个话题很陌生,我的邮件分发器仍然存在一些问题。虽然在电子邮件标题(mail.from)中搜索和查找电子邮件确实有效,但它在电子邮件正文中不起作用。有没有人有这方面的经验,愿意帮忙吗?您可以在“//检查邮件正文中的其他地址(尚未使用)” - 评论下找到我正在讨论的功能。我想,我的正则表达式是正确的。此外,如果matchAll-Function返回一个数组并且无法保存在subscriber.email-object中,它至少应该记录到控制台。如果在邮件正文中有邮件地址的邮件,我也会在收件箱中手动检查。至少有两个,应该找到..

执行mailparsing的 App.js 部分:

const simpleParser = require('mailparser').simpleParser;
//const htmlparser = require("htmlparser2");
var fs = require('fs');

var config = require('./config');

var Imap = require('imap');
var imap = new Imap(config.imap);

var blacklistString = '';

String.prototype.matchAll = function(regexp) {
    var matches = [];
    this.replace(regexp, function() {
        var arr = ([]).slice.call(arguments, 0);
        var extras = arr.splice(-2);
        arr.index = extras[0];
        arr.input = extras[1];
        matches.push(arr);
    });
    return matches.length ? matches : null;
 };

function openInbox(subbox,cb) {
    imap.openBox('INBOX.'+subbox, true, cb);
}

function getBoxes(cb) {
    imap.getBoxes(cb);
}
function showBoxes(boxes) {
    imap.end();
}

function logArrayElements(element) {
    if(element[1].indexOf('placeholder.de')==-1){
        addToBlacklistString(element[1]);
    }
}
 function addToBlacklistString(str) {
    blacklistString += str+"\n";
}
function writeBlacklistFile() {
    fs.appendFile('data/data.csv', blacklistString, function (err) {
        if (err) throw err;
        console.log('Saved!');
    });
}



function search(searchArray, regex){
     imap.search(searchArray, function(err, results) {
        if (err) throw err;
        var temp = 0;
        var mailtemp = [];
        var f = imap.fetch(results, { bodies: '' });
        f.on('message', function(msg, seqno) {
            console.log('Message #%d', seqno);
            var prefix = '(#' + seqno + ') ';
             msg.on('body', function(stream, info) {
                simpleParser(stream, (err, mail)=>{
                    //console.log(temp);
                    //console.log(mail.subject);
                      /*fs.writeFile('data/'+seqno+'.txt',mail.text, function(err){
                        console.log(err);
                     });*/

                    //var text = mail.text;

                    // New Subscriber Object
                     var subscr = new Subscriber({nr: '', mailIdent: '', from: '', emails: '', text:'', uLink: '', anwalt: false });
                     subscr.nr = seqno;

                    //Check for From-Address

                    if(!!mail.from) {
                         //console.log(mail.from.value);
                        for(var i = 0; i < mail.from.value.length; i++) {
                            mailtemp = mail.from.value[i].address.matchAll(regex);

                             mailtemp.forEach(function(element){
                            /*fs.appendFile('data/data.csv', element[0] + "\n", function(error){
                                console.log(error);
                            });*/

                            subscr.from = element[0];

                        });
                        if(!!mailtemp) {
                            mailtemp.forEach(logArrayElements);
                        }

                    }

                }else{
                    //console.log(mail.text);
                }

                // Message-ID

                if(!!mail.messageId) {
                    subscr.mailIdent = mail.messageId;
                }
                console.log(mail.messageId);

                // Check for other addresses in Mail-Body (Doesn't work yet)

                var regexEmails = new RegExp('/([\w\.\-\_\#\+]+@[\w\.\-\_äüö]+\.[a-zA-Z]+)/g');
                if(!!mail.text){
                    if(mail.text.matchAll(regexEmails)!=null) {
                        subscr.emails = mail.text.matchAll(regexEmails);
                        console.log(subscr.emails);
                    }
                }

                /* Split mail.text at substrings in substr-array. Extend if necessary..
                 *
                 * Also check for 'Anwalt'-Expression in splitted Substring
                 *
                 * If mail.text doesn't exist -> Check for html body and convert it to text-format
                 */

                //var  regexLink = new RegExp('\.de\/(unsubscribe|austragen)\/([^\"]+)');
                var  regexAnwalt = new RegExp('nwalt|echtsanwalt|rechtlicher');

                if(!!mail.text) {
                    var substr = ["schrieb pplaceholder.de", "Von: \"placeholder.de", "Von: pplaceholder.de", "From: placeholder.de", "Ursprüngliche Nachricht"];
                    for (var i = 0; i<substr.length; i++) {
                        if(mail.text.indexOf(substr[i]) > -1) {
                            var textTemp = mail.text;
                            var arr = textTemp.split(substr[i]);
                            if(arr[0].matchAll(regexAnwalt)!=null) {
                                subscr.anwalt = true;
                            };

                            subscr.text = arr[0];
                            break;
                        } else {
                            subscr.text = mail.text;
                        }
                    }


                    //console.log(arr);

                }
                else
                {
                    var html = mail.html;
                    var text = htmlToText.fromString(html, {
                        noLinkBrackets: true,
                        ignoreImage: true,
                        uppercaseHeadings: false,
                        preserveNewlines: false,
                        wordwrap:130,
                        format: {
                            heading: function (node, fn, options) {
                                var h = fn(node.children, options);
                                return '\n==== ' + h + ' ====\n\n';
                            }
                        }
                    });
                    subscr.text = text;
                }


                mail.headers.forEach(function(value, key) {
                    //console.log(value);
                });

                subscr.save();
                //console.log(subscr);
                temp++;
            });
        });
        msg.once('end', function() {
            console.log(prefix + 'Finished');
        });
    });
    f.once('error', function(err) {
        console.log('Fetch error: ' + err);
    });
    f.once('end', function() {
        console.log('Done fetching all messages!');
        //writeBlacklistFile();
        imap.end();
    });
});
}

imap.once('ready', function() {
 openInbox('Test',function(err, box) {
    var searchArray = [['FROM', '@']];
    search(searchArray,/([\w\.\-\_\#\+]+@[\w\.\-\_äüö]+\.[a-zA-Z]+)/g);
    });
 });
imap.once('error', function(err) {
console.log(err);
});

imap.once('end', function() {
    console.log('Connection ended');
});

imap.connect();

app.listen(2700, function(){
  console.log("Listening on Port 2700")
});

module.exports = app;

subscriber.js

const mongoose = require('mongoose');

var subscriberSchema = mongoose.Schema({

    nr: Number,
    mailIdent: String,
    from: String,
    emails: String,
    text: String,
    uLink: String,
    anwalt: Boolean
});

var Subscriber = module.exports = mongoose.model('Subscriber', subscriberSchema);

//get Subscriber
module.exports.getSubscribers = function(callback, limit){
Subscriber.find(callback).limit(limit);
};

module.exports.getSubscriberByID = function(_id, callback){
    Subscriber.findById(_id, callback);
};

1 个答案:

答案 0 :(得分:0)

电子邮件的正则表达式有点不对劲。 我也没有注意到matchAll-Fct。正在回馈一个二维数组。以下是代码的更改部分:

var regexEmails = new RegExp("([\\w\\.\\-\\_\\#\\+]+@[\\w\\.\\-\\_äüö]+\\.[a-zA-Z]+)");

var temp1 = mail.text.matchAll(regexEmails);
                    if(!!temp1){
                        //console.log(temp1);
                        for(var i =0; i<temp1.length; i++) {
                            if(temp1[0][i]!=='info@service.placeholder.de' && temp1[0][i] !== "info@placeholder.de"){
                                subscr.emails += temp1[0][i];
                            }
                        }

                    }