我看了几个关于如何在 JS 中使用正则表达式的例子,但始终找不到符合我需求的写法。基本上,我有一个单词数组:
commonWords=["she", "he", "him", "liked", "i", "a", "an", "are"]
和一个字符串:
'She met him where he liked to eat "the best" cheese pizza.'
基本上,我想用非字母字符以及 commonWords 数组中的单词作为分隔符来提取短语。对上面的字符串,应该得到类似这样的结果:
'met, where, to eat, the best, cheese pizza'
答案 0 :(得分:2)
你在寻找这样的东西:
// Words that should be stripped out of the sentence.
var commonWords = ["she", "he", "him", "liked", "i", "a", "an", "are"];
// Assemble the pattern source from its parts; the result is
//   \b(she|he|him|liked|i|a|an|are)\b
// i.e. any listed word, matched only as a whole word.
var regstr = ["\\b(", commonWords.join("|"), ")\\b"].join("");
// "i" ignores case, "g" matches every occurrence instead of only the first.
var regex = new RegExp(regstr, "ig");
var str = 'She met him where he liked to eat "the best" cheese pizza.';
// Delete every match; the whitespace that surrounded each word is left in place.
var stripped = str.replace(regex, "");
console.log(stripped);
输出(被删除的单词两侧的空格会原样保留,下面为便于阅读已合并多余空格):
met where to eat "the best" cheese pizza.
`split` 版本:
var commonWords = ["she", "he", "him", "liked", "i", "a", "an", "are"];
// Non-capturing group (?:...) so that split() does not inject the matched words into its result.
var regstr = ["\\b(?:", commonWords.join("|"), ")\\b"].join("");
var regex = new RegExp(regstr, "ig");
var str = 'She met him where he liked to eat "the best" cheese pizza.';
var arr = str.split(regex);
console.log(arr); // ["", " met ", " where ", " ", " to eat "the best" cheese pizza."]
// Clean the pieces in place. Walking from the end means a removal never
// shifts an element that still has to be visited.
for (var idx = arr.length - 1; idx >= 0; idx--) {
    var isBlank = arr[idx].match(/^\s*$/) !== null;
    if (isBlank) {
        // Drop empty pieces and pieces made only of whitespace.
        arr.splice(idx, 1);
    } else {
        // Strip leading and trailing whitespace from the pieces that are kept.
        arr[idx] = arr[idx].replace(/^\s+|\s+$/g, "");
    }
}
console.log(arr); // ["met", "where", "to eat "the best" cheese pizza."]
console.log(arr.join(", ")); // met, where, to eat "the best" cheese pizza.
答案 1 :(得分:1)
引自提问者(OP):
“基本上,我想用非字母字符以及 commonWords 数组中的单词作为分隔符来提取短语。”
这个答案同时处理了这两种分隔符(不像其他答案 ;-))。它既可以返回字符串,也可以返回数组。
var commonWords = ["she", "he", "him", "liked", "i", "a", "an", "are"];
var SourceStr = 'She met him where he liked to eat "the best" cheese pizza, didn\'t she, $%&#! Mr. O\'Leary?';

//--- Escape regex metacharacters so every common word is matched literally.
var escapedWords = commonWords.map(function (word) {
    return word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
});

//--- Kill (most) non-alphas and the keywords: each match is replaced with a tab.
//    Alternative 1: a run of characters that are not digits, letters, apostrophes or spaces.
//    Alternative 2: a whole common word together with the whitespace around it.
//    The pattern is built with the RegExp constructor instead of eval(), so no
//    string is ever executed as code.
var zRegEx = new RegExp('([^0-9a-z\' ]+)|\\s*\\b(' + escapedWords.join('|') + ')\\b\\s*', 'ig');
var sPhraseList = SourceStr.replace(zRegEx, '\t');

//-- Turn each run of tabs (plus adjacent spaces) into ", " and collapse doubled delimiters.
sPhraseList = sPhraseList.replace(/ *\t+ */g, ', ').replace(/, ?, ?/g, ', ');
//-- Trim leading and trailing delimiters.
sPhraseList = sPhraseList.replace(/(^[, ]+)|([, ]+$)/g, '');

//-- Make optional array. Declared with var: assigning to an undeclared name
//   creates an implicit global and throws in strict mode / ES modules.
var aPhraseList = sPhraseList.split(/, */g);

//-- Replace "console.log" with "alert" if no console is available.
console.log(SourceStr);
console.log(sPhraseList);
console.log(aPhraseList);
返回:
"met, where, to eat, the best, cheese pizza, didn't, Mr, O'Leary"
以及
["met", "where", "to eat", "the best", "cheese pizza", "didn't", "Mr", "O'Leary"]
答案 2 :(得分:0)
这个版本相当冗长,但也能处理“懒惰”的单引号和双引号:
判断数组是否包含某个对象,并带有一个不区分大小写的比较标志(类似 indexOfObject):
/**
 * Array.prototype.containsObject(object, caseInsensitive)
 *
 * Reports whether the array holds an element equal to `object`.
 *
 * @param {*} object - The value to look for.
 * @param {boolean} [caseInsensitive] - When truthy, two strings also count as
 *     equal if they differ only in letter case.
 * @returns {boolean} true if a matching element exists, false otherwise.
 *
 * Every element is examined. The case-insensitive comparison is a whole-string
 * equality test, not a regular-expression or substring match, so "he" is not
 * found in ["she"] and regex metacharacters in `object` are harmless.
 */
if (!Array.prototype.containsObject) Object.defineProperty(Array.prototype, 'containsObject', {
    // Non-enumerable, so `for...in` loops over arrays never see this method.
    enumerable: false,
    configurable: true,
    writable: true,
    value: function (object, caseInsensitive) {
        var foldCase = Boolean(caseInsensitive) && typeof object == 'string';
        var needle = foldCase ? object.toLowerCase() : object;
        for (var i = 0; i < this.length; i++) {
            // Loose equality is kept on purpose: existing callers may rely on 1 == "1".
            if (this[i] == object) return true;
            if (foldCase && typeof this[i] == 'string' && this[i].toLowerCase() === needle) return true;
        }
        return false;
    }
});
如果对象非空,则将其追加到数组:
/**
 * Array.prototype.pushIfNotEmpty(object)
 *
 * Appends `object` to the array unless it is considered empty.
 *
 * @param {*} object - Candidate value.
 * @returns {undefined} Nothing is returned, whether or not the value was pushed.
 *
 * Skipped values: undefined, and anything for which `(object && object.length) <= 0`
 * holds -- i.e. null, 0, false, '' and truthy values whose length is <= 0 (such as []).
 * Truthy values without a length (plain objects, non-zero numbers, true) are pushed.
 */
if (!Array.prototype.pushIfNotEmpty) Object.defineProperty(Array.prototype, 'pushIfNotEmpty', {
    // Non-enumerable, so `for...in` loops over arrays never see this method.
    enumerable: false,
    configurable: true,
    writable: true,
    value: function (object) {
        if (typeof object == 'undefined') return;
        // Same value as `object && object.length`: the length of a truthy
        // value, or the falsy value itself.
        var measure = object ? object.length : object;
        if (measure <= 0) return;
        this.push(object);
    }
});
规范化字符串:
/**
 * Prepares a string for splitting on single spaces while keeping quoted
 * phrases in one piece.
 *
 * 1. Every occurrence of `whitespaceSpecifier` is doubled.
 * 2. Inside each single-quoted token matched by /'([^'s][^']*)'/ (a quote
 *    followed by "s"/"S" is not treated as an opening quote) and each
 *    double-quoted token, every space is replaced by the doubled specifier.
 *
 * @param {string} inString - Text to canonicalize.
 * @param {string} whitespaceSpecifier - Marker string, e.g. "_".
 * @returns {string} The canonicalized text, or '' if either argument is not a string.
 *
 * All occurrences are rewritten, not only the first one: String.replace with a
 * string pattern stops after one match, so split/join and a replacer callback
 * are used instead.
 */
function canonicalizeString (inString, whitespaceSpecifier) {
    if (typeof inString != 'string') return '';
    if (typeof whitespaceSpecifier != 'string') return '';

    var whitespaceReplacement = whitespaceSpecifier + whitespaceSpecifier;

    // Replacer for one quoted token: swap each of its spaces for the doubled specifier.
    var protectSpaces = function (quotedToken) {
        return quotedToken.split(' ').join(whitespaceReplacement);
    };

    return inString
        .split(whitespaceSpecifier).join(whitespaceReplacement)
        .replace(/'([^'s][^']*)'/ig, protectSpaces)
        .replace(/"([^"]*)"/ig, protectSpaces);
}
玩得开心:
/**
 * Splits a string on single spaces and returns the tokens that are not common words.
 *
 * Relies on canonicalizeString (so a quoted phrase stays one token) and on the
 * Array.prototype helpers pushIfNotEmpty and containsObject.
 *
 * @param {string} inString - Text to tokenize.
 * @param {Array} inCommonWordsArray - Words to drop; anything whose `length`
 *     is not a number is rejected.
 * @returns {Array<string>} Remaining tokens in their original order, or [] when
 *     either argument fails its type check.
 *
 * Every doubled specifier ("__") in a kept token is turned back into a space.
 * Empty tokens, produced by consecutive spaces, are dropped.
 */
function getSignificantTokensFromStringWithCommonWords (inString, inCommonWordsArray) {
    if (typeof inString != 'string') return [];
    if (typeof (inCommonWordsArray && inCommonWordsArray.length) != 'number') return [];

    var whitespaceSpecifier = "_";
    // canonicalizeString writes the specifier twice wherever it protects a space.
    var protectedSpace = whitespaceSpecifier + whitespaceSpecifier;

    // Index loops with declared counters: for...in would also visit enumerable
    // Array.prototype additions, and undeclared counters are implicit globals.
    var commonWords = [];
    for (var wordIndex = 0; wordIndex < inCommonWordsArray.length; wordIndex++) {
        commonWords.pushIfNotEmpty(canonicalizeString(inCommonWordsArray[wordIndex], whitespaceSpecifier));
    }

    var tokens = canonicalizeString(inString, whitespaceSpecifier).split(" ");
    var responseObject = [];
    for (var tokenIndex = 0; tokenIndex < tokens.length; tokenIndex++) {
        var token = tokens[tokenIndex];
        if (token === '') continue;
        if (commonWords.containsObject(token, true)) continue;
        // split/join restores every protected space, not just the first one.
        responseObject.push(token.split(protectedSpace).join(" "));
    }
    return responseObject;
}