如何使用正则表达式解析通用的复杂URL?
我希望从URL字符串中获取信息,包括协议,主机名和路径。
答案 0 :(得分:4)
您可以使用正则表达式解析URL / URI。
一个包含各个组成部分的复杂 URL 示例如下:
http://login:password@www.example.org:80/demo/example.cgi?lang=de&foo=bar&empty#position
用于解析这种复杂 URL 的正则表达式(RegExp)类似于:
([^ :]*):\/\/(?:([^:]*):([^@]*)@|)([^/:]{1,}):?(\d*)?(\/[^? ]*)\??((?:[^=&# ]*=?[^&# ]*&?)*)#?([^ ]*)?
是的,它太疯狂了。但是,您可以从中获取以下字段(组):
#1 Protocol, #2 Login, #3 Password, #4 Host name, #5 Port, #6 Path, #7 Query, #8 Fragment
假设您有一些网址,并且只想知道主机名:
var myURL = "http://www.example.org/demo/example.cgi?lang=de&foo=bar&empty";

/**
 * Extracts the host name from the first "scheme://..." URL found in a string.
 *
 * Only the part of the URL up to the host is matched, so URLs without a path,
 * port, query or fragment are handled as well. IPv6 literals ("[::1]") are
 * not supported by this pattern.
 *
 * @param {string} theURL - Text containing the URL to inspect.
 * @returns {string|undefined} The host name, or undefined when no URL is found.
 */
function getHostname(theURL) {
  // Capture groups: #1 protocol, #2 login, #3 password, #4 host name.
  // The credential groups must not cross "/", "?" or "#"; otherwise an "@"
  // that appears later in the path or query would be mistaken for the
  // userinfo separator. The password part is optional ("user@host").
  // No "g" flag: a single exec() call does not need lastIndex bookkeeping.
  var Expr = /([^ :]*):\/\/(?:([^:@\/?# ]*)(?::([^@\/?# ]*))?@)?([^\/:?#@ ]+)/,
      match = Expr.exec(theURL);
  if (match) {
    return match[4]; // 4th group: host name
  }
  return undefined;
}

var myHostname = getHostname(myURL);
console.log(myHostname);

我整理了一张表格,列出了 URL 字符串中每个组成部分对应的正则表达式(每个表达式都通过第 1 个捕获组取出对应的值):
| URL entry name | Example | Regular Expression |
| ----------------- | --------------------- | ------------------------------- |
| Protocol | http | ([^ :]*):\/\/ |
| Login | admin | \/\/([^:]*):[^@]*(?=@) |
| Password | 12345 | \/\/[^:]*:([^@]*)(?=@) |
| Host name | www.example.org | (?:@|\/\/)([^/:]{1,}) |
| Domain parts | www, example, org | (?:@|\/\/|\.)([^./:]*)(?=[./:]) |
| Port | 80 | :(\d*)\/[^/] |
| Path | /demo/example.cgi | \/\/[^/ ]*(\/[^?# ]*) |
| File name | example.cgi | ([^?/]*(?!\/))\? |
| Query string | lang=de&foo=bar&empty | \?((?:[^=&# ]*=?[^&# ]*&?)*) |
| Fragment/position | position | #([^ ]*) |
此外,您可以使用正则表达式 ([^=&# ]*)=?([^&# ]*)&? 解析查询字符串,并逐个迭代匹配项:
var myQueryString = "lang=de&foo=bar&empty";

/**
 * Parses a query string such as "lang=de&foo=bar&empty" into a plain object.
 *
 * - A single leading "?" (as found in location.search) is ignored.
 * - A key without "=" gets the empty string as its value.
 * - Empty pairs produced by stray separators ("a=1&&b=2", "&a=1") are skipped.
 * - Parsing stops at the first space or "#".
 * - Keys and values are returned as written; no percent-decoding is done.
 *
 * @param {string} theQueryString - The query string to parse.
 * @returns {Object} Map of key to value; empty for null/undefined/"" input.
 */
function parseQueryString(theQueryString) {
  var QueryEntries = {};
  // Without this guard exec() would stringify the argument and produce a
  // bogus "undefined" / "null" key.
  if (theQueryString === undefined || theQueryString === null) {
    return QueryEntries;
  }
  var query = String(theQueryString).replace(/^\?/, '');
  var Expr = /([^=&# ]*)=?([^&# ]*)&?/g;
  var match;
  // When nothing is left to consume, exec() yields an empty match (or null);
  // both are falsy in this condition, which ends the loop.
  while ((match = Expr.exec(query)) && match[0]) {
    if (match[0] === '&') {
      continue; // lone separator: no key and no value, so nothing to store
    }
    QueryEntries[match[1]] = match[2] || '';
  }
  return QueryEntries;
}

var myQueryEntries = parseQueryString(myQueryString);
console.log(myQueryEntries);

您可以在http://regexr.com/上轻松测试您的RegExpr。
答案 1 :(得分:1)
不要使用正则表达式。使用URL解析器。
/**
 * Creates an anchor element and assigns the given URL to its href.
 *
 * The caller reads the URL components from the returned element
 * (protocol, hostname, port, pathname, search, hash, ...).
 *
 * @param {string} url - The URL to assign to the anchor's href.
 * @returns {*} The element returned by document.createElement('a').
 */
function parseURL(url) {
  var anchor = document.createElement('a');
  anchor.href = url;
  return anchor;
}
// Demo: parse a full URL through the anchor element and print each component.
// The trailing comments show the expected console output for that property.
var urlData = parseURL('https://username:password@sub.example.com:123/foo/bar?a=b#c');
[
  'protocol', // https:
  'username', // username
  'password', // password
  'host',     // sub.example.com:123
  'hostname', // sub.example.com
  'port',     // 123
  'pathname', // /foo/bar
  'search',   // ?a=b
  'hash',     // #c
  'origin',   // https://sub.example.com:123
  'href'      // https://username:password@sub.example.com:123/foo/bar?a=b#c
].forEach(function (part) {
  console.log(urlData[part]);
});
此外还有 URL 接口(`URL` interface)。它的浏览器支持较少,但在语义上可能比借助 DOM 元素的做法更好。
// Demo: parse the same URL with the URL interface and print each component.
// The trailing comments show the expected console output for that property.
var urlData = new URL('https://username:password@sub.example.com:123/foo/bar?a=b#c');
[
  'protocol', // https:
  'username', // username
  'password', // password
  'host',     // sub.example.com:123
  'hostname', // sub.example.com
  'port',     // 123
  'pathname', // /foo/bar
  'search',   // ?a=b
  'hash',     // #c
  'origin',   // https://sub.example.com:123
  'href'      // https://username:password@sub.example.com:123/foo/bar?a=b#c
].forEach(function (part) {
  console.log(urlData[part]);
});