如何使用正则表达式来解析通用的复杂URL?

时间:2016-10-01 21:37:01

标签: javascript regex

如何使用正则表达式解析通用的复杂URL?

我希望从URL字符串中获取信息,包括协议,主机名和路径。

2 个答案:

答案 0 :(得分:4)

使用正则表达式解析URL

您可以使用正则表达式解析URL / URI。

下面是一个包含所有组成部分的复杂URL示例:

http://login:password@www.example.org:80/demo/example.cgi?lang=de&foo=bar&empty#position

用于解析这种复杂URL的正则表达式(RegExp)如下:

([^ :]*):\/\/(?:([^:]*):([^@]*)@|)([^/:]{1,}):?(\d*)?(\/[^? ]*)\??((?:[^=&# ]*=?[^&# ]*&?)*)#?([^ ]*)?
没错,它看起来很吓人。不过,您可以通过它的捕获组得到以下字段:

#1 Protocol, #2 Login, #3 Password, #4 Host name, #5 Port, #6 Path, #7 Query, #8 Fragment

假设您有一些网址,并且只想知道主机名:



// Sample URL used below: it has no credentials and no explicit port.
var myURL = "http://www.example.org/demo/example.cgi?lang=de&foo=bar&empty";

/**
 * Extracts the host name from a URL string.
 *
 * The pattern keeps eight capture groups (#1 protocol, #2 login,
 * #3 password, #4 host name, #5 port, #6 path, #7 query, #8 fragment) and is
 * written so that:
 *   - the path is optional ("http://example.org" matches),
 *   - a login may appear without a password ("http://user@host/"),
 *   - credentials and host stop at "/", "?", "#" and spaces, so an "@" that
 *     occurs later in the path or query is never taken for the credentials
 *     separator.
 *
 * @param {string} theURL - Text containing a URL of the form "scheme://...".
 * @returns {string|undefined} The host name, or undefined when no URL is found.
 */
function getHostname(theURL) {
    // No "g" flag: exec() is called once, so no lastIndex bookkeeping is needed.
    var Expr = /([^ :]*):\/\/(?:([^:@\/?# ]*)(?::([^@\/?# ]*))?@)?([^\/:?# ]+)(?::(\d*))?(\/[^? ]*)?\??((?:[^=&# ]*=?[^&# ]*&?)*)#?([^ ]*)?/,
        match = Expr.exec(theURL);

    if (!match) {
        return undefined;
    }
    return match[4]; // capture group #4 holds the host name
}

// Extract the host name from the sample URL and print it.
var myHostname = getHostname(myURL);

console.log(myHostname); // www.example.org




我整理了一张表格,列出了提取URL各个组成部分所用的正则表达式(所需的值均位于第1个捕获组中):

| URL entry name    | Example               | Regular Expression              |
| ----------------- | --------------------- | ------------------------------- |
| Protocol          | http                  | ([^ :]*):\/\/                   |
| Login             | admin                 | \/\/([^:]*):[^@]*(?=@)          |
| Password          | 12345                 | \/\/[^:]*:([^@]*)(?=@)          |
| Host name         | www.example.org       | (?:@|\/\/)([^/:]{1,})           |
| Domain parts      | www, example, org     | (?:@|\/\/|\.)([^./:]*)(?=[./:]) |
| Port              | 80                    | :(\d*)\/[^/]                    |
| Path              | /demo/example.cgi     | \/\/[^/]+(\/[^?# ]*)            |
| File name         | example.cgi           | ([^?/]*(?!\/))\?                |
| Query string      | lang=de&foo=bar&empty | \?((?:[^=&# ]*=?[^&# ]*&?)*)    |
| Fragment/position | position              | #([^ ]*)                        |

此外,您可以使用([^=&# ]*)=?([^&# ]*)&?解析查询字符串并迭代匹配项:



// Sample query string; the last key ("empty") has no "=value" part.
var myQueryString = "lang=de&foo=bar&empty";

/**
 * Parses a query string such as "lang=de&foo=bar&empty" into a plain object.
 *
 * Keys without a value map to ''. When a key occurs more than once, the last
 * occurrence wins. Values are returned as written (no percent-decoding).
 * Parsing stops at the first space or "#".
 *
 * @param {string} [theQueryString] - Query string, with or without a leading "?".
 * @returns {Object} Key/value pairs; an empty object when nothing was given.
 */
function parseQueryString(theQueryString) {
    var Expr = /([^=&# ]*)=?([^&# ]*)&?/g,
        QueryEntries = {},
        source,
        match;

    // Without this guard exec() would coerce null/undefined to the literal
    // text "null"/"undefined" and report it as a key.
    if (theQueryString == null) {
        return QueryEntries;
    }

    // Accept input that still carries its leading "?".
    source = String(theQueryString).replace(/^\?/, '');

    // Once nothing is left to consume, exec() yields an empty match (or null,
    // depending on the engine). Both are falsy in this condition, so a single
    // check ends the loop.
    while ((match = Expr.exec(source)) && match[0]) {
        // A bare "&" (from "&&" or a leading "&") is an empty segment, not a
        // pair with an empty key, so it is skipped.
        if (match[0] !== '&') {
            QueryEntries[match[1]] = match[2] || '';
        }
    }
    return QueryEntries;
}

// Turn the sample query string into a key/value object and print it.
var myQueryEntries = parseQueryString(myQueryString);

console.log(myQueryEntries); // { lang: "de", foo: "bar", empty: "" }




您可以在http://regexr.com/上方便地测试您的正则表达式。

答案 1 :(得分:1)

不要使用正则表达式。使用URL解析器。

/**
 * Parses a URL by assigning it to the href of a detached <a> element.
 *
 * The element is created through `document`, so an environment that provides
 * a DOM is required. The element itself is returned; callers read the URL
 * components from its properties.
 *
 * @param {string} url - The URL to parse.
 * @returns {Object} The <a> element whose href was set to `url`.
 */
function parseURL(url) {
  var anchor = document.createElement('a');
  anchor.href = url;
  return anchor;
}
// Usage: parseURL() goes through `document`, so this needs an environment
// that provides a DOM. Each property read below is one component of the URL;
// the trailing comments show the expected output.
var urlData = parseURL('https://username:password@sub.example.com:123/foo/bar?a=b#c');
console.log(urlData.protocol); // https:
console.log(urlData.username); // username
console.log(urlData.password); // password
console.log(urlData.host);     // sub.example.com:123
console.log(urlData.hostname); // sub.example.com
console.log(urlData.port);     // 123
console.log(urlData.pathname); // /foo/bar
console.log(urlData.search);   // ?a=b
console.log(urlData.hash);     // #c
console.log(urlData.origin);   // https://sub.example.com:123
console.log(urlData.href);     // https://username:password@sub.example.com:123/foo/bar?a=b#c

此外还有URL接口(URL interface)。它的浏览器支持范围较窄,但在语义上可能比借用DOM元素更合适。

// The URL constructor parses the string directly; the resulting object is
// read through the property names logged below, with the expected output in
// the trailing comments.
var urlData = new URL('https://username:password@sub.example.com:123/foo/bar?a=b#c');
console.log(urlData.protocol); // https:
console.log(urlData.username); // username
console.log(urlData.password); // password
console.log(urlData.host);     // sub.example.com:123
console.log(urlData.hostname); // sub.example.com
console.log(urlData.port);     // 123
console.log(urlData.pathname); // /foo/bar
console.log(urlData.search);   // ?a=b
console.log(urlData.hash);     // #c
console.log(urlData.origin);   // https://sub.example.com:123
console.log(urlData.href);     // https://username:password@sub.example.com:123/foo/bar?a=b#c