首先,请原谅这个问题篇幅较长,但我确实想学会用函数式的方式思考,而这是我遇到过的较有挑战性的问题之一。
我希望得到一些关于如何以函数式方式处理问题的建议,特别是在F#中。我正在编写一个程序,用来遍历一组目录,使用一组正则表达式模式过滤从这些目录中检索到的文件列表,再使用第二组正则表达式模式在检索到的文件的文本中查找匹配项。我希望它能针对每一段与给定正则表达式模式匹配的文本,返回文件名、行索引、列索引、模式和匹配值。此外,需要记录异常,可能出现的异常情况有3种:无法打开目录、无法打开文件、从文件读取内容失败。最后一个要求是,被扫描并匹配的文件数量可能非常大,所以整个过程必须是惰性(lazy)求值的。我并不太在意解决方案是否是“纯”函数式的,我更想要的是一个可读性好、性能也好的“好”方案。最后一个挑战是让它能与C#互操作,因为我想用WinForms工具把这个算法接到UI上。下面是我的第一次尝试,希望它能把问题说清楚:
open System.Text.RegularExpressions
open System.IO
/// A Reader is a computation that receives a shared environment 't
/// and produces a result 'a (the result varies with the environment).
type Reader<'t, 'a> = 't -> 'a

/// Wraps a plain value as a Reader that ignores its environment.
let returnM x _ = x

/// Post-processes the result of Reader `m` with the plain function `f`.
let map f m =
    fun env -> f (m env)

/// Runs Reader `m`, then asks `f` for a function (built from the same
/// environment) and applies that function to the result of `m`.
let apply f m =
    fun env ->
        // `m` is evaluated before `f`, exactly as in a left-to-right pipeline.
        let value = m env
        let transform = f env
        transform value

/// Runs Reader `m`, feeds its result to `f` to obtain the next Reader,
/// and runs that Reader against the same environment.
let bind f m =
    fun env ->
        let next = f (m env)
        next env
/// Builds a lazy scanner over `dirs` as a chain of nested Readers.
///
/// Usage:
///   Scanner dirs dirExHandler (filenamepatterns, lineExHandler, fileExHandler) contentpatterns
///
/// The result is a lazy sequence of
///   (file name, zero-based line index, match column index, pattern, matched text)
/// with one element per regex match found in any file whose path matches at
/// least one of `filenamepatterns`.
///
/// Failures are reported instead of raised:
///   * dirExHandler  e directory       - the directory could not be listed
///   * fileExHandler e filename        - the file could not be opened
///   * lineExHandler e filename index  - reading line `index` failed; the rest
///                                       of that file is skipped
let Scanner dirs =
    // Lazily reads `filename` as (line, zero-based index) pairs.
    // The reader is owned by the sequence itself: `use` inside a sequence
    // expression disposes it when enumeration ends. Opening the file eagerly
    // and returning a lazy sequence over it (as the previous version did)
    // disposed the reader before the first line was ever read.
    let readIndexedLines lineExHandler fileExHandler (filename : string) =
        seq {
            let opened =
                try
                    Some((new FileInfo(filename)).OpenText())
                with e ->
                    fileExHandler e filename
                    None
            match opened with
            | None -> ()
            | Some stream ->
                use reader = stream
                // EndOfStream can itself fail, so it is checked inside the try.
                let nextLine index =
                    try
                        if reader.EndOfStream then
                            None
                        else
                            Some((reader.ReadLine(), index), index + 1)
                    with e ->
                        lineExHandler e filename index
                        None
                yield! Seq.unfold nextLine 0
        }
    // Pairs every pattern with its Regex. Seq.cache keeps construction lazy
    // but guarantees each Regex is built once instead of once per file/line.
    let compile patterns =
        patterns
        |> Seq.map (fun (pattern : string) -> pattern, new Regex(pattern))
        |> Seq.cache
    returnM dirs
    |> apply (fun dirExHandler ->
        Seq.collect (fun directory ->
            try
                Directory.GetFiles(directory, "*", SearchOption.AllDirectories)
            with e ->
                dirExHandler e directory
                Array.empty))
    |> map (fun filenames ->
        returnM filenames
        |> apply (fun (filenamepatterns, lineExHandler, fileExHandler) ->
            let filenameRegexes = compile filenamepatterns
            Seq.filter (fun (filename : string) ->
                filenameRegexes
                |> Seq.exists (fun (_, (regex : Regex)) -> regex.IsMatch(filename)))
            >> Seq.map (fun filename ->
                (filename, readIndexedLines lineExHandler fileExHandler filename))
            >> (fun files ->
                returnM files
                |> apply (fun contentpatterns ->
                    let contentRegexes = compile contentpatterns
                    Seq.collect (fun (filename, lines) ->
                        lines
                        |> Seq.collect (fun ((content : string), index) ->
                            contentRegexes
                            |> Seq.collect (fun (pattern, (regex : Regex)) ->
                                regex.Matches(content)
                                |> Seq.cast<Match>
                                |> Seq.map (fun contentmatch ->
                                    (filename,
                                     index,
                                     contentmatch.Index,
                                     pattern,
                                     contentmatch.Value)))))))))
感谢您的任何意见。
更新 - 这是根据我收到的反馈修改后的解决方案:
open System.Text.RegularExpressions
open System.IO
/// Everything the scanner needs besides the list of directories.
type ScannerConfiguration =
    { /// Regex patterns; a file is scanned when its path matches at least one.
      FileNamePatterns : seq<string>
      /// Regex patterns searched for in every line of a scanned file.
      ContentPatterns : seq<string>
      /// Receives the exception and the file name when a file cannot be opened.
      FileExceptionHandler : exn -> string -> unit
      /// Receives the exception, the file name and the zero-based line index
      /// when reading a line fails.
      LineExceptionHandler : exn -> string -> int -> unit
      /// Receives the exception and the directory when it cannot be listed.
      DirectoryExceptionHandler : exn -> string -> unit }
/// Lazily scans every file beneath `specifiedDirectories` (recursively).
///
/// A file is read when its path matches at least one of
/// `configuration.FileNamePatterns`; every line is then searched with each of
/// `configuration.ContentPatterns`.
///
/// Yields one tuple per match:
///   (file name, zero-based line index, match column index, pattern text, matched text)
///
/// Failures are passed to the handlers in `configuration` instead of being
/// raised: an unlistable directory contributes no files, an unopenable file
/// contributes no lines, and a failed read ends that file early.
let scanner specifiedDirectories (configuration : ScannerConfiguration) = seq {
    // Seq.cache keeps Regex construction lazy but makes sure each pattern is
    // compiled once per enumeration rather than once per file or line.
    let toCachedRegexes = Seq.map (fun (pattern : string) -> new Regex(pattern)) >> Seq.cache
    let contentRegexes = configuration.ContentPatterns |> toCachedRegexes
    let filenameRegexes = configuration.FileNamePatterns |> toCachedRegexes

    // Produces (line, zero-based index) pairs from an open reader.
    // EndOfStream is checked inside the try because it can fail as well.
    let getLines exHandler (reader : StreamReader) =
        0
        |> Seq.unfold (fun index ->
            try
                if reader.EndOfStream then
                    None
                else
                    Some((reader.ReadLine(), index), index + 1)
            with e ->
                exHandler e index
                None)

    for specifiedDirectory in specifiedDirectories do
        let files =
            try Directory.GetFiles(specifiedDirectory, "*", SearchOption.AllDirectories)
            with e -> configuration.DirectoryExceptionHandler e specifiedDirectory; [||]
        for file in files do
            if filenameRegexes |> Seq.exists (fun (regex : Regex) -> regex.IsMatch(file)) then
                let opened =
                    try Some((new FileInfo(file)).OpenText())
                    with e -> configuration.FileExceptionHandler e file; None
                match opened with
                | None -> ()
                | Some stream ->
                    // The reader must stay open while its lines are consumed.
                    // `use` inside the sequence expression disposes it only
                    // after the loop below finishes (or the consumer stops).
                    // Previously it was disposed before the lazy line sequence
                    // was enumerated, so the first read hit a closed reader.
                    use reader = stream
                    let reportLineError e index = configuration.LineExceptionHandler e file index
                    for (content, index) in getLines reportLineError reader do
                        for contentregex in contentRegexes do
                            for mmatch in contentregex.Matches(content) |> Seq.cast<Match> do
                                yield (file, index, mmatch.Index, contentregex.ToString(), mmatch.Value) }
同样,欢迎任何意见。
答案 0 :(得分:8)
我认为最好的方法是从最简单的解决方案开始,然后逐步扩展。在我看来,您目前的写法比较难读,原因有两个:
代码大量使用了组合子和函数组合,而这种写法在F#中并不常见。其中一些处理用序列表达式来写会更容易。
代码都是作为单个函数编写的,但它相当复杂,如果它被分成多个函数,则会更具可读性。
我可能会先把代码拆分成两个函数:一个用于测试单个文件(比如 `fileMatches`),另一个遍历文件并调用 `fileMatches`。主迭代过程可以用F#序列表达式很好地写出来:
// Intended to check whether a file's name matches a file-name pattern
// and whether its content matches a content pattern.
// The caller below uses the result as an 'if' condition, so it must be a bool.
// NOTE(review): the body is deliberately left unimplemented in this sketch.
let fileMatches fileNamePatterns contentPatterns
(fileExHandler, lineExHandler) file =
// TODO: This can be implemented using
// File.ReadLines, which returns a sequence of lines
// Walks the specified directories and lazily returns every file
// that 'fileMatches' accepts
let scanner specifiedDirectories fileNamePatterns contentPatterns
            (dirExHandler, fileExHandler, lineExHandler) =
    // Lists all files beneath one directory (recursively). A failure is
    // handed to dirExHandler and the directory then contributes no files.
    let listFiles specifiedDir =
        try Directory.GetFiles(specifiedDir, "*", SearchOption.AllDirectories)
        with e -> dirExHandler e specifiedDir; [||]
    // Decides whether a single file is part of the result
    let isHit file =
        fileMatches fileNamePatterns contentPatterns
                    (fileExHandler, lineExHandler) file
    // Seq.collect and Seq.filter are both lazy, so no directory is listed
    // and no file is tested until the result is enumerated
    specifiedDirectories
    |> Seq.collect listFiles
    |> Seq.filter isHit
该函数仍然相当复杂,因为需要传递很多参数。把这些参数包装到一个简单类型或记录中可能是个好主意:
/// Bundles the scanner's settings so they can be passed as a single value.
type ScannerArguments = {
    /// Pattern text used to select files by name.
    FileNamePatterns : string
    /// Pattern text used to search file contents.
    ContentPatterns : string
    /// Called with the exception and a string when handling a file fails.
    FileExceptionHandler : exn -> string -> unit
    /// Called with the exception and a string when handling a line fails.
    LineExceptionHandler : exn -> string -> unit
    /// Called with the exception and the directory when listing it fails.
    DirectoryExceptionHandler : exn -> string -> unit
}
然后,您可以把 `fileMatches` 和 `scanner` 定义为只接受两个参数的函数,这会让代码更具可读性。大致如下:
// Iterates over all the specified directories and lazily yields every file
// for which 'fileMatches args file' is true.
// A directory that cannot be listed is reported through
// args.DirectoryExceptionHandler and contributes no files.
let scanner specifiedDirectories (args:ScannerArguments) = seq {
    for specifiedDir in specifiedDirectories do
        // Fixed: the record field is spelled DirectoryExceptionHandler
        // (it was misspelled 'DirectoryEceptionHandler', which does not compile)
        let files =
            try Directory.GetFiles(specifiedDir, "*", SearchOption.AllDirectories)
            with e -> args.DirectoryExceptionHandler e specifiedDir; [||]
        for file in files do
            // No need to propagate all arguments explicitly to other functions
            if fileMatches args file then yield file }