所以,我正在看的html数据是:
<A HREF="/data/client/Action.log">Action.log</A><br> 6/8/2015 3:45 PM
我需要从这段 HTML 中提取出文件名 Action.log。
我的问题是:我已经学过大量的正则表达式教程,但似乎仍然想不出能把它提取出来的匹配模式。我想我对正则表达式还缺乏一些基本的理解,任何帮助我都会非常感激。
编辑:
/// <summary>
/// Requests the directory-index page at <paramref name="url"/> and scans it line by line for
/// <c>&lt;a href=</c> links. A link whose target contains no '/' is treated as a file and is
/// returned; any other target except "../" is registered as a sub-directory of
/// <paramref name="directory"/>. Scanning stops at the first link-less line containing "&lt;/pre".
/// </summary>
/// <param name="url">Address of the index page; also the prefix for sub-directory URLs.</param>
/// <param name="directory">Directory node whose SubDirectories list receives discovered folders.</param>
/// <returns>
/// The file link targets found; an empty array when the status is not OK; <c>null</c> when an
/// exception was caught (it is logged together with the URL).
/// </returns>
internal string[] ParseFolderIndex_Alpha(string url, WebDirectory directory)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = 3 * 60 * 1000; // three minutes, expressed in milliseconds
        request.KeepAlive = true;

        // using guarantees the response is released on every exit path
        // (non-OK status, normal completion, or an exception while reading).
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK)
            {
                return new string[0];
            }

            List<string> fileLocations = new List<string>();
            using (StreamReader reader = new StreamReader(response.GetResponseStream()))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    int index = line.IndexOf("<a href=", StringComparison.Ordinal);
                    if (index < 0)
                    {
                        // A link-less line holding "</pre" ends the listing.
                        if (line.Contains("</pre")) break;
                        continue;
                    }

                    string[] segments = line.Substring(index).Split('\"');
                    if (segments.Length < 2)
                    {
                        // No quoted href value on this line; skip it rather than
                        // throwing and discarding everything collected so far.
                        continue;
                    }

                    // TODO: the file size could be parsed from this line as well.
                    string target = segments[1];
                    if (!target.Contains("/"))
                    {
                        fileLocations.Add(target);
                        UI.UpdatePatchNotes("Web File Found: " + target);
                        UI.UpdateProgressBar();
                    }
                    else if (target != @"../")
                    {
                        directory.SubDirectories.Add(new WebDirectory(url + target, this));
                        UI.UpdatePatchNotes("Web Directory Found: " + target.Replace("/", string.Empty));
                    }
                }
            }

            return fileLocations.ToArray<string>();
        }
    }
    catch (Exception e)
    {
        LogHandler.LogErrors(e.ToString(), this);
        LogHandler.LogErrors(url, this);
        return null;
    }
}
以上就是我之前的做法。问题是我更换了服务器,而 IIS 生成的 HTML 目录列表格式不同,所以我必须重新编写解析逻辑。
修改/ 结论:
首先,对不起,我甚至提到正则表达式:P其次,每个平台都必须根据环境单独处理。
这就是我目前收集文件名的方式。
/// <summary>
/// Requests the directory-index page at <paramref name="url"/> and extracts link targets from
/// every line that contains "&lt;/A&gt;". Each such line is split on double quotes; pieces that
/// contain no '&lt;' are taken as paths and the first of them is discarded. A path containing
/// "." or "Verinfo" is treated as a file (its last '/'-separated part is returned, unless it is
/// empty or contains '%'); any other path is registered as a sub-directory of
/// <paramref name="directory"/>. Reading stops at end of stream, or after a processed line that
/// contains "&lt;/html&gt;" or "&lt;/pre".
/// </summary>
/// <param name="url">Address of the index page; also the prefix for sub-directory URLs.</param>
/// <param name="directory">Directory node whose SubDirectories list receives discovered folders.</param>
/// <returns>
/// The file names found; an empty array when the status is not OK; <c>null</c> when an
/// exception was caught (it is logged together with the URL).
/// </returns>
internal string[] ParseFolderIndex(string url, WebDirectory directory)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = 3 * 60 * 1000; // three minutes, expressed in milliseconds
        request.KeepAlive = true;

        // using guarantees the response is released on every exit path.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK)
            {
                return new string[0];
            }

            List<string> fileLocations = new List<string>();
            using (StreamReader reader = new StreamReader(response.GetResponseStream()))
            {
                string line;
                // ReadLine() returning null ends the loop, so a page without a
                // closing tag can no longer spin forever.
                while ((line = reader.ReadLine()) != null)
                {
                    if (line.IndexOf("</A>", StringComparison.Ordinal) < 0)
                    {
                        continue; // also skips empty lines
                    }

                    string[] segments = line.Replace("\\", "").Split('\"');
                    List<string> paths = new List<string>();
                    foreach (string segment in segments)
                    {
                        if (segment.IndexOf('<') < 0)
                        {
                            paths.Add(segment);
                        }
                    }

                    // NOTE(review): the first quoted value is dropped; presumably it is the
                    // parent-directory link of the listing - confirm against the server output.
                    if (paths.Count > 0)
                    {
                        paths.RemoveAt(0);
                    }

                    List<string> files = new List<string>();
                    foreach (string path in paths)
                    {
                        string[] parts = path.Split('/');
                        if (path.Contains(".") || path.Contains("Verinfo"))
                        {
                            files.Add(parts[parts.Length - 1]);
                        }
                        else if (parts.Length >= 2)
                        {
                            // For "/a/b/" the split ends with an empty string, so the
                            // folder name is the second-to-last part.
                            string folderName = parts[parts.Length - 2];
                            directory.SubDirectories.Add(new WebDirectory(url + "/" + folderName, this));
                            UI.UpdatePatchNotes("Web Directory Found: " + folderName);
                        }
                        // A value without any '/' has no folder name to take; it is skipped.
                    }

                    foreach (string file in files)
                    {
                        if (!String.IsNullOrEmpty(file) && file.IndexOf('%') < 0)
                        {
                            fileLocations.Add(file);
                            UI.UpdatePatchNotes("Web File Found: " + file);
                            UI.UpdateProgressBar();
                        }
                    }

                    // Either marker on a processed line means the listing is complete.
                    if (line.Contains("</html>") || line.Contains("</pre"))
                    {
                        break;
                    }
                }
            }

            return fileLocations.ToArray<string>();
        }
    }
    catch (Exception e)
    {
        LogHandler.LogErrors(e.ToString(), this);
        LogHandler.LogErrors(url, this);
        return null;
    }
}
答案 0 :(得分:1)
答案 1 :(得分:1)
在这里使用正则表达式有些小题大做,它过于笨重。考虑到字符串的格式始终相同,你会发现使用 Split 和 Substring 更容易调试和维护。
/// <summary>
/// Demonstrates extracting the link target and the bare file name from one directory-listing
/// anchor line using plain string splitting instead of a regular expression.
/// </summary>
class Program {
    static void Main(string[] args) {
        String s = "<A HREF=\"/data/client/Action.log\">Action.log</A><br> 6/8/2015 3:45 PM ";
        // To get the entire file name and path....
        String filePath = GetHrefValue(s);
        // To get just the file name (Action.log in this case)....
        String fileName = GetFileName(s);
    }

    /// <summary>
    /// Returns the text between the first pair of double quotes in
    /// <paramref name="anchorHtml"/>, i.e. the href value of a line that starts with an anchor.
    /// </summary>
    /// <param name="anchorHtml">One line of listing HTML; may be null or empty.</param>
    /// <returns>The quoted value, or an empty string when the input has no double quote.</returns>
    public static String GetHrefValue(String anchorHtml) {
        if (String.IsNullOrEmpty(anchorHtml)) {
            return String.Empty;
        }
        String[] parts = anchorHtml.Split('"');
        // parts[1] only exists when at least one quote was present.
        return parts.Length > 1 ? parts[1] : String.Empty;
    }

    /// <summary>
    /// Returns the part of the href value that follows its last '/'.
    /// </summary>
    /// <param name="anchorHtml">One line of listing HTML; may be null or empty.</param>
    /// <returns>The file name, or an empty string when no href value is found.</returns>
    public static String GetFileName(String anchorHtml) {
        return GetHrefValue(anchorHtml).Split('/').Last();
    }
}
答案 2 :(得分:-1)
// Sample line taken from the directory listing.
string text = @"<A HREF=""/data/client/Action.log"">Action.log</A><br> 6/8/2015 3:45 PM";
// Whole-line pattern for an anchor whose href is a .log file under /data/client/;
// capture group 1 is the link text between the opening and closing anchor tags.
var pattern = new Regex(@"^<A HREF=\""\/data\/client\/.*\.log\"">(.*)</A>.*$");
var match = pattern.Match(text);
// The link text captured by group 1.
var result = match.Groups[1].Value;
尝试http://regexr.com/或Regexbuddy!