如何使用JavaScript从一段文本中将URL提取到一个数组中

时间:2015-08-25 11:46:38

标签: javascript regex string

我在HTML中有一个文本框,用户可以在其中输入任何类型的文本。它也可以包含URL。

如何使用JavaScript从该文本中将URL提取到数组中?

这个URL为如何替换文本提供了一个很好的示例。但我想将这些URL提取到一个数组中,以便对它们进行处理。

// Sample input: free-form text typed by the user that contains one URL.
var userText = "Hello, I can't http://google.com for c*ap" 
// getUrlsFromText is the function being asked for (it is not defined in this snippet).
var urls = getUrlsFromText(userText); 
console.log("Urls!",urls) 
--> must give ['http://google.com']

编辑:这不是重复问题,因为在发布问题之前我已经找到了用于检测URL的正则表达式。我需要的是一种把匹配结果从字符串中提取到数组里的方法。

2 个答案:

答案 0 :(得分:3)

只需使用String.match()

/**
 * Extract every URL found in a piece of text.
 *
 * The pattern is deliberately simple: an http, https or ftp scheme followed
 * by a run of characters that are not whitespace, angle brackets or quotes.
 * Swap in a stricter URL regex if needed, but keep the `g` flag so that
 * String.prototype.match returns all matches instead of only the first.
 *
 * @param {string} input - Text to scan.
 * @returns {string[]} The matched URLs in order of appearance; an empty
 *     array when the text contains none.
 */
function getUrlsFromText(input) {
    var urlPattern = /\b(?:https?|ftp):\/\/[^\s<>"']+/gi;
    // match() yields null (not an empty array) when nothing matches.
    return input.match(urlPattern) || [];
}

很容易找到网址的正则表达式,例如https://mathiasbynens.be/demo/url-regex

答案 1 :(得分:2)

您可以使用此代码:



/**
 * Collect every URL-like substring of a string.
 *
 * Recognised forms are "scheme:" or "scheme://" prefixed hosts, "www."
 * prefixed hosts and "user@host" forms, each optionally followed by a
 * path, query string and fragment.
 *
 * @param {string} s - Text to scan.
 * @returns {string[]} All matches in order of appearance (empty if none).
 */
function getURLsFromString(s) {
  var re = /((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=+$,\w]+@)?[A-Za-z0-9.-]+|(?:www\.|[-;:&=+$,\w]+@)[A-Za-z0-9.-]+)((?:\/[+~%\/.\w-]*)?\??(?:[-+=&;%@.\w]*)#?\w*)?)/gm; 
  var m;
  var arr = [];
  // Scan the argument itself; reading an outer `str` variable would ignore
  // the parameter and throw when no such variable exists.
  while ((m = re.exec(s)) !== null) {
    // Guard against an infinite loop should a match ever be zero-length.
    if (m.index === re.lastIndex) {
        re.lastIndex++;
    }
    arr.push(m[0]);
  }
  return arr;
}

// Example usage: run the extractor on a sample sentence that contains one URL
// and print the resulting array.
var str = 'Hello, I can\'t http://google.com for c*ap';
console.log(getURLsFromString(str));




我清理了这个正则表达式,因为其中有太多不必要的转义,而且其中的连字符 - 存在一个问题。

如果您更喜欢Diego Perini's URL regex,请使用

// URL matcher assembled from labelled fragments that are joined into one
// pattern string and compiled with the global and case-insensitive flags.
var re = new RegExp(
  [
    // scheme: http, https or ftp followed by "://"
    "(?:(?:https?|ftp)://)",
    // optional "user:pass@" credentials
    "(?:\\S+(?::\\S*)?@)?",
    "(?:",
    // Alternative 1: dotted IPv4 address.
    // Reject private and local networks first.
    "(?!(?:10|127)(?:\\.\\d{1,3}){3})",
    "(?!(?:169\\.254|192\\.168)(?:\\.\\d{1,3}){2})",
    "(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})",
    // Octets: first octet is 1-223 (no 0.x.x.x, nothing >= 224.0.0.0),
    // last octet is 1-254 (no network or broadcast address).
    "(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])",
    "(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}",
    "(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))",
    "|",
    // Alternative 2: host name ...
    "(?:(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)",
    // ... followed by any number of dot-separated domain labels ...
    "(?:\\.(?:[a-z\\u00a1-\\uffff0-9]-*)*[a-z\\u00a1-\\uffff0-9]+)*",
    // ... and a top-level domain of at least two letters,
    "(?:\\.(?:[a-z\\u00a1-\\uffff]{2,}))",
    // which may be followed by a trailing dot.
    "\\.?",
    ")",
    // optional port number (2 to 5 digits)
    "(?::\\d{2,5})?",
    // optional resource path, query string or fragment
    "(?:[/?#]\\S*)?"
  ].join(""),
  "gi"
);

或者使用它的单行版本:

// Single-line regex-literal form of the URL pattern: http/https/ftp scheme, optional
// credentials, a non-private IPv4 address or a host name with TLD, then optional port
// and path. Flags: g (find all matches) and i (case-insensitive).
var re = /(?:(?:https?|ftp):\/\/)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,}))\.?)(?::\d{2,5})?(?:[\/?#]\S*)?/gi;