我费了很大周折才得到下面这个字符串(保存在变量 data 中):
"<html>\n<head>\n<script language=\"JavaScript\"> \n\n //////////////////////////////////////////////////////////////// \n // This [base64 encoder and decoder] was written by Tyler Akins and has been placed in the \n // public domain. It would be nice if you left this header intact. \n // Base64 code from Tyler Akins -- http://rumkin.com \n var keyStr = \"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=\"; \n\n function encode64(input) { \n var output = \"\"; \n var chr1, chr2, chr3; \n var enc1, enc2, enc3, enc4; \n var i = 0; \n\n do { \n chr1 = input.charCodeAt(i++); \n chr2 = input.charCodeAt(i++); \n chr3 = input.charCodeAt(i++); \n\n enc1 = chr1 >> 2; \n enc2 = ((chr1 & 3) << 4) | (chr2 >> 4); \n enc3 = ((chr2 & 15) << 2) | (chr3 >> 6); \n enc4 = chr3 & 63; \n\n if (isNaN(chr2)) { \n enc3 = enc4 = 64; \n } else if (isNaN(chr3)) { \n enc4 = 64; \n } \n\n output = output + keyStr.charAt(enc1) + keyStr.charAt(enc2) + \n keyStr.charAt(enc3) + keyStr.charAt(enc4); \n } while (i < input.length); \n\n return output; \n } \n function decode64(input) { \n var output = \"\"; \n var chr1, chr2, chr3; \n var enc1, enc2, enc3, enc4; \n var i = 0; \n\n // remove all characters that are not A-Z, a-z, 0-9, +, /, or = \n input = input.replace(/[^A-Za-z0-9\\+\\/\\=]/g, \"\"); \n\n do { \n enc1 = keyStr.indexOf(input.charAt(i++)); \n enc2 = keyStr.indexOf(input.charAt(i++)); \n enc3 = keyStr.indexOf(input.charAt(i++)); \n enc4 = keyStr.indexOf(input.charAt(i++)); \n\n chr1 = (enc1 << 2) | (enc2 >> 4); \n chr2 = ((enc2 & 15) << 4) | (enc3 >> 2); \n chr3 = ((enc3 & 3) << 6) | enc4; \n\n output = output + String.fromCharCode(chr1); \n\n if (enc3 != 64) { \n output = output + String.fromCharCode(chr2); \n } \n if (enc4 != 64) { \n output = output + String.fromCharCode(chr3); \n } \n } while (i < input.length); \n\n return output; \n } \n\n // end of Tyler Akins' code \n //////////////////////////////////////////////////////////////// \n function escapePluses(s) { 
\n return s.replace(/\\+/g, \"%2B\"); \n } \n function getFragment(thisuri) { \n var pound = thisuri.indexOf(\"#\"); \n if (pound == -1) { \n return null; \n } else { \n return thisuri.substr(pound + 1); \n } \n } \n function saveFragment() { \n var fragment = getFragment(document.URL); \n if (fragment != null) { \n var pre_marker = \"&aka_frag=\"; \n var g_req = decode64(document.relay.pubcookie_g_req.value); \n var header_end = g_req.indexOf(pre_marker) + pre_marker.length; \n var req_head = g_req.substr(0,header_end); \n var req_foot = g_req.substr(header_end); \n if ((req_foot.length > 0) && (req_foot.charAt(0) != '&')) { \n req_foot = req_foot.substr(req_foot.indexOf(\"&\")); \n } \n var new_req = req_head + escapePluses(encode64(fragment)) + req_foot; \n document.relay.pubcookie_g_req.value = encode64(new_req); \n } \n } \n\n function doStuff() { \n saveFragment(); \n document.relay.submit(); \n } \n\n// setTimeout('doStuff()', 1000); \n</script></head>\n<body onLoad=\"doStuff()\">\n<form method=post action=\"https://weblogin.server.com/\" name=relay>\n<input type=hidden name=pubcookie_g_req value=\"b25lPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdHdvPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdGhyZWU9MSZmb3VyPWE1YSZmaXZlPUdFVCZzaXg9c2lhbS1wcm8ucWEuYWthbWFpLmNvbSZzZXZlbj1MMk52Ym1acFozQmhjbk5sWDNCdmNuUnomZWlnaHQ9JmFrYV9mcmFnPSZob3N0bmFtZT1zaWFtLXByby5xYS5ha2FtYWkuY29tJm5pbmU9MSZmaWxlPSZyZWZlcmVyPShudWxsKSZzZXNzX3JlPTAmcHJlX3Nlc3NfdG9rPS0xNTE4MTQyNjAwJmZsYWc9MA==\">\n<input type=hidden name=post_stuff value=\"\">\n<input type=hidden name=relay_url value=\"https://siam-pro.qa.server.com/PubCookie.reply\">\n<noscript>\n<p align=center>You do not have Javascript turned on, please click the button to continue.\n<p align=center>\n<input type=submit name=go value=Continue>\n</noscript>\n</form>\n</html>\n"
然后我想用下面这个正则表达式来匹配该字符串:
<form [^>]*action=(?:\\*"([^"]*)\\*"|([^" >]*))[^>]* name=relay>(.*?)<\/form>
这个表达式在 Rubular 中能按预期匹配,但在 IRB(Ruby 1.9.3)中,我得到的却是以下结果:
1.9.3p448 :147 > data =~/<form [^>]*action=(?:\\*"([^"]*)\\*"|([^" >]*))[^>]* name=relay>(.*?)<\/form>/
=> nil
我在这里做错了什么?
答案 0 :(得分:3)
正则表达式和 HTML/XML 并不是好搭档。一旦 HTML 发生变化,你的模式极有可能随之失效;而使用解析器可以降低代码因此出错的可能性。例如,很容易预见到标签中各个属性的顺序或写法可能发生改变。以下面这个表单标签为例:
<form method="post" action="https://weblogin.server.com/" name="relay">
该表单今后可能会变成以下任意一种写法:
<form method="post" action="https://weblogin.server.com/" name="relay" >...</form>
<form method="post" action="https://weblogin.server.com/" name="relay1" >...</form>
<form name="relay" method="post" action="https://weblogin.server.com/">...</form>
<form name="relay" method="post" action="https://weblogin.server.com/">...</form >
只要出现其中任何一种情况,正则表达式就会立刻失效。
解析器不会关心这些更改。
require 'nokogiri'
html = "<html>\n<head>\n<script language=\"JavaScript\"> \n\n //////////////////////////////////////////////////////////////// \n // This [base64 encoder and decoder] was written by Tyler Akins and has been placed in the \n // public domain. It would be nice if you left this header intact. \n // Base64 code from Tyler Akins -- http://rumkin.com \n var keyStr = \"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=\"; \n\n function encode64(input) { \n var output = \"\"; \n var chr1, chr2, chr3; \n var enc1, enc2, enc3, enc4; \n var i = 0; \n\n do { \n chr1 = input.charCodeAt(i++); \n chr2 = input.charCodeAt(i++); \n chr3 = input.charCodeAt(i++); \n\n enc1 = chr1 >> 2; \n enc2 = ((chr1 & 3) << 4) | (chr2 >> 4); \n enc3 = ((chr2 & 15) << 2) | (chr3 >> 6); \n enc4 = chr3 & 63; \n\n if (isNaN(chr2)) { \n enc3 = enc4 = 64; \n } else if (isNaN(chr3)) { \n enc4 = 64; \n } \n\n output = output + keyStr.charAt(enc1) + keyStr.charAt(enc2) + \n keyStr.charAt(enc3) + keyStr.charAt(enc4); \n } while (i < input.length); \n\n return output; \n } \n function decode64(input) { \n var output = \"\"; \n var chr1, chr2, chr3; \n var enc1, enc2, enc3, enc4; \n var i = 0; \n\n // remove all characters that are not A-Z, a-z, 0-9, +, /, or = \n input = input.replace(/[^A-Za-z0-9\\+\\/\\=]/g, \"\"); \n\n do { \n enc1 = keyStr.indexOf(input.charAt(i++)); \n enc2 = keyStr.indexOf(input.charAt(i++)); \n enc3 = keyStr.indexOf(input.charAt(i++)); \n enc4 = keyStr.indexOf(input.charAt(i++)); \n\n chr1 = (enc1 << 2) | (enc2 >> 4); \n chr2 = ((enc2 & 15) << 4) | (enc3 >> 2); \n chr3 = ((enc3 & 3) << 6) | enc4; \n\n output = output + String.fromCharCode(chr1); \n\n if (enc3 != 64) { \n output = output + String.fromCharCode(chr2); \n } \n if (enc4 != 64) { \n output = output + String.fromCharCode(chr3); \n } \n } while (i < input.length); \n\n return output; \n } \n\n // end of Tyler Akins' code \n //////////////////////////////////////////////////////////////// \n function 
escapePluses(s) { \n return s.replace(/\\+/g, \"%2B\"); \n } \n function getFragment(thisuri) { \n var pound = thisuri.indexOf(\"#\"); \n if (pound == -1) { \n return null; \n } else { \n return thisuri.substr(pound + 1); \n } \n } \n function saveFragment() { \n var fragment = getFragment(document.URL); \n if (fragment != null) { \n var pre_marker = \"&aka_frag=\"; \n var g_req = decode64(document.relay.pubcookie_g_req.value); \n var header_end = g_req.indexOf(pre_marker) + pre_marker.length; \n var req_head = g_req.substr(0,header_end); \n var req_foot = g_req.substr(header_end); \n if ((req_foot.length > 0) && (req_foot.charAt(0) != '&')) { \n req_foot = req_foot.substr(req_foot.indexOf(\"&\")); \n } \n var new_req = req_head + escapePluses(encode64(fragment)) + req_foot; \n document.relay.pubcookie_g_req.value = encode64(new_req); \n } \n } \n\n function doStuff() { \n saveFragment(); \n document.relay.submit(); \n } \n\n// setTimeout('doStuff()', 1000); \n</script></head>\n<body onLoad=\"doStuff()\">\n<form method=post action=\"https://weblogin.server.com/\" name=relay>\n<input type=hidden name=pubcookie_g_req value=\"b25lPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdHdvPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdGhyZWU9MSZmb3VyPWE1YSZmaXZlPUdFVCZzaXg9c2lhbS1wcm8ucWEuYWthbWFpLmNvbSZzZXZlbj1MMk52Ym1acFozQmhjbk5sWDNCdmNuUnomZWlnaHQ9JmFrYV9mcmFnPSZob3N0bmFtZT1zaWFtLXByby5xYS5ha2FtYWkuY29tJm5pbmU9MSZmaWxlPSZyZWZlcmVyPShudWxsKSZzZXNzX3JlPTAmcHJlX3Nlc3NfdG9rPS0xNTE4MTQyNjAwJmZsYWc9MA==\">\n<input type=hidden name=post_stuff value=\"\">\n<input type=hidden name=relay_url value=\"https://siam-pro.qa.server.com/PubCookie.reply\">\n<noscript>\n<p align=center>You do not have Javascript turned on, please click the button to continue.\n<p align=center>\n<input type=submit name=go value=Continue>\n</noscript>\n</form>\n</html>\n"
# Parse the HTML string held in `html` into a Nokogiri document.
doc = Nokogiri::HTML(html)
# `at` returns the first node matching the selector, i.e. the first <form> in the page.
form = doc.at('form')
# Print the form node serialized back to HTML (sample output shown below).
puts form.to_html
# >> <form method="post" action="https://weblogin.server.com/" name="relay">
# >> <input type="hidden" name="pubcookie_g_req" value="b25lPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdHdvPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdGhyZWU9MSZmb3VyPWE1YSZmaXZlPUdFVCZzaXg9c2lhbS1wcm8ucWEuYWthbWFpLmNvbSZzZXZlbj1MMk52Ym1acFozQmhjbk5sWDNCdmNuUnomZWlnaHQ9JmFrYV9mcmFnPSZob3N0bmFtZT1zaWFtLXByby5xYS5ha2FtYWkuY29tJm5pbmU9MSZmaWxlPSZyZWZlcmVyPShudWxsKSZzZXNzX3JlPTAmcHJlX3Nlc3NfdG9rPS0xNTE4MTQyNjAwJmZsYWc9MA=="><input type="hidden" name="post_stuff" value=""><input type="hidden" name="relay_url" value="https://siam-pro.qa.server.com/PubCookie.reply"><noscript>
# >> <p align="center">You do not have Javascript turned on, please click the button to continue.
# >> </p>
# >> <p align="center">
# >> <input type="submit" name="go" value="Continue"></p>
# >> </noscript>
# >> </form>
# Attributes of a node are read with hash-style indexing.
form['action'] # => "https://weblogin.server.com/"
# First <input> nested inside the form.
input = form.at('input')
input['name'] # => "pubcookie_g_req"
input['value'] # => "b25lPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdHdvPXNpYW0tcHJvLnFhLmFrYW1haS5jb20mdGhyZWU9MSZmb3VyPWE1YSZmaXZlPUdFVCZzaXg9c2lhbS1wcm8ucWEuYWthbWFpLmNvbSZzZXZlbj1MMk52Ym1acFozQmhjbk5sWDNCdmNuUnomZWlnaHQ9JmFrYV9mcmFnPSZob3N0bmFtZT1zaWFtLXByby5xYS5ha2FtYWkuY29tJm5pbmU9MSZmaWxlPSZyZWZlcmVyPShudWxsKSZzZXNzX3JlPTAmcHJlX3Nlc3NfdG9rPS0xNTE4MTQyNjAwJmZsYWc9MA=="
Nokogiri 是 Ruby 社区中最受欢迎的 XML/HTML 解析器。它速度快、易于使用,而且功能非常强大。
答案 1 :(得分:0)
在 Ruby 中,"." 默认不匹配换行符,而 <form ...> 与 </form> 之间包含换行,因此 (.*?) 无法跨行匹配,整个表达式返回 nil。您需要使用 m 修饰符启用多行匹配(让 "." 也能匹配换行符),例如:
/<form [^>]*action=(?:\\*"([^"]*)\\*"|([^" >]*))[^>]* name=relay>(.*?)<\/form>/m