在F#中对XML进行模式匹配

时间:2012-09-09 12:49:43

标签: xml f# pattern-matching dsl fparsec

新库:XParsec

这个问题催生了一个用F# 3.0编写的、与流和类型无关、非线性、可扩展的parsec实现——它受FParsec启发,摆脱了对Chars和线性Streams的依赖并做了简化:http://corsis.github.com/XParsec/


模式

1 = < font=?'Bold' bbox=F'l ..' s      >                       ;       < ~s >*
2 = < font=!'Bold' bbox=F'l ..' s=?'(' > | [ 1.l < 2.l ]       ;       < ~s >*
3 = < font=!'Bold' bbox=F'l ..' s=?')' > | [ 1.l < 3.l ]

其中

element names are left unspecified
font, bbox and s are attributes

V = string, N = string

?  :: V -> bool                -- value          contains string
!  :: V -> bool = not . (?)    -- value does not contain  string
~  :: N -> bool                -- value of attribute N is empty or whitespace
F  :: V -> [(N, float)]        -- extracts a list of named floats from value
RM :: V -> bool                -- value matches regular expression
[] :: [bool]                   -- list of conditions

代码

open System.Xml.Linq
open System.Collections.Generic

/// `a -?- b` is true when the string `a` contains `b` as a substring.
let inline (-?-)  a b = (a : string).Contains b
/// Prefix `~~s` is true when `s` is null, empty, or consists only of whitespace.
// Fully qualified: the file opens neither `System` nor anything else that
// provides `String.IsNullOrWhiteSpace`.
let inline (~~) s = s |> System.String.IsNullOrWhiteSpace
/// Prefix `!> x` converts `x` by calling the target type's static `op_Implicit`
/// (used below to turn a string into an `XName`).
let inline (!>) x = ( ^a : (static member op_Implicit : ^b -> ^a) x )

/// `x @ n` reads attribute `n` of element `x`; yields the empty string when the
/// attribute is absent. Note that this shadows the built-in list-append operator.
let inline (@)  (x : XElement) n   =
  match x.Attribute(!> n) with
  | null -> System.String.Empty   // qualified: `System` is not opened in this file
  | a    -> a.Value
/// `x @< n <| v` sets attribute `n` of element `x` to `v` via `SetAttributeValue`.
let inline (@<) (x : XElement) n v = x.SetAttributeValue(!> n, v)

/// Shorthand for an enumerator (cursor) over XML elements.
type XE = IEnumerator<XElement>

/// Parses the space-separated numbers of the element's `bbox` attribute into a
/// float list. Empty tokens are discarded, so a missing/empty attribute (or
/// doubled spaces) gives a shorter or empty list instead of failing on
/// `float ""`; a token that is not a number still raises from `float`.
let inline bbox e    =
  let raw : string = e @ "bbox"
  raw.Split([| ' ' |], System.StringSplitOptions.RemoveEmptyEntries)
  |> Seq.map float
  |> Seq.toList
/// First component of a bounding-box list, or `nan` when the list is empty.
let inline left bbox =
  match bbox with
  | []         -> nan
  | first :: _ -> first

/// `mark n elements` sets the attribute `class-<n>` on every element to one
/// shared, freshly generated GUID, tagging them as members of the same match.
/// The GUID is created when `mark` is applied to `n`, i.e. once per call.
let mark n =
  // qualified: `System` is not opened in this file
  let id = System.Guid.NewGuid()
  Seq.iter (fun e -> e @< "class-" + n <| id)

/// Tries to recognise a "speaker" entry starting at the enumerator's current
/// element and tags the matching elements with `mark "speaker"`.
///
/// Pattern, as implemented below:
///   1. the current element's `font` contains "Bold";
///   2. the next element with non-blank `s` is not bold, starts further right
///      (larger first `bbox` number) and its `s` contains "(";
///      if that `s` also contains ")", elements 1 and 2 are marked;
///   3. the next element with non-blank `s` after that is not bold, starts
///      further right than element 1 and its `s` contains ")";
///      then elements 1, 2 and 3 are marked (with a new GUID).
///
/// `n` must already be positioned on an element (MoveNext called by the caller).
/// The enumerator is advanced as a side effect. Running out of elements simply
/// ends the match attempt; `Current` is never read after `MoveNext` returned
/// false, where its value would be undefined.
let speaker  (n : XE) =
  // Advance to the next element whose `s` attribute has text; None at the end.
  let rec nextWithText () =
    if not (n.MoveNext()) then None
    elif ~~(n.Current @ "s") then nextWithText ()
    else Some n.Current
  let isBold (e : XElement) = (e @ "font") -?- "Bold"
  let leftOf (e : XElement) = e |> bbox |> left
  let c1 = n.Current
  if isBold c1 then
    let l1 = leftOf c1
    match nextWithText () with
    | Some c2 when not (isBold c2) && l1 < leftOf c2 && (c2 @ "s") -?- "(" ->
        // Parenthesis opened and closed inside the same element.
        if (c2 @ "s") -?- ")" then [c1; c2] |> mark "speaker"
        // The parenthesised text may continue (or close) in a following element.
        match nextWithText () with
        | Some c3 when not (isBold c3) && l1 < leftOf c3 && (c3 @ "s") -?- ")" ->
            [c1; c2; c3] |> mark "speaker"
        | _ -> ()
    | _ -> ()


/// Runs `speaker` once for every `span` descendant of `x` from index `start`
/// onwards, each time with a fresh enumerator positioned on that span.
let testFrom (start : int) (x : XElement) =
  let spans = x.Descendants(!> "span") |> Seq.toArray
  for i = start to spans.Length - 1 do
    // `use` disposes the enumerator at the end of each iteration.
    use n = (spans |> Seq.skip i).GetEnumerator()
    // Position the enumerator on spans.[i]; always succeeds because i < Length.
    n.MoveNext() |> ignore
    speaker n

/// Original entry point: scans from span index 29.
// NOTE(review): 29 is presumably where the relevant spans begin in the
// author's full document; confirm, or call `testFrom 0` for other inputs.
let test (x : XElement) = testFrom 29 x

输入

<doc>
<block bbox="63.2999 550.846 246.865 561.875">
  <line bbox="63.2999 550.846 246.865 561.875">
    <span bbox="63.2999 550.846 189.001 561.875" font="TimesNewRoman,Bold" size="9.96" s="Dr. Frank-Walter Steinmeier " />
    <span bbox="189 550.846 246.865 561.875" font="TimesNewRoman" size="9.96" s="(SPD)  . . . . . ." />
  </line>
</block>
<block bbox="63.2999 567.766 246.875 578.796">
  <line bbox="63.2999 567.766 246.875 578.796">
    <span bbox="63.2999 567.766 136.004 578.796" font="TimesNewRoman,Bold" size="9.96" s="Rainer Brüderle " />
    <span bbox="136.02 567.766 246.875 578.796" font="TimesNewRoman" size="9.96" s="(FDP) . . . . . . . . . . . . . . . . ." />
  </line>
</block>
<block bbox="63.2999 584.626 250.351 651.456">
  <line bbox="63.2999 584.626 246.826 595.656">
    <span bbox="63.2999 584.626 152.105 595.656" font="TimesNewRoman,Bold" size="9.96" s="Sahra Wagenknecht " />
    <span bbox="152.16 584.626 246.826 595.656" font="TimesNewRoman" size="9.96" s="(DIE LINKE)  . . . . . . ." />
  </line>
  <line bbox="63.2999 600.362 250.351 613.34">
    <span bbox="63.2999 601.546 139.327 612.576" font="TimesNewRoman,Bold" size="9.96" s="Siegfried Kauder " />
    <span bbox="139.38 601.546 247.762 612.576" font="TimesNewRoman" size="9.96" s="(Villingen-Schwenningen) " />
    <span bbox="247.861 600.362 250.351 613.34" font="Symbol" size="9.96" s=" " />
  </line>
  <line bbox="74.6404 612.526 246.911 623.556">
    <span bbox="74.6404 612.526 246.911 623.556" font="TimesNewRoman" size="9.96" s="(CDU/CSU) . . . . . . . . . . . . . . . . . . . . . . . ." />
  </line>
  <line bbox="63.2999 628.202 191.909 641.18">
    <span bbox="63.2999 629.386 126.374 640.416" font="TimesNewRoman,Bold" size="9.96" s="Jürgen Trittin " />
    <span bbox="126.419 629.386 189.433 640.416" font="TimesNewRoman" size="9.96" s="(BÜNDNIS 90/" />
    <span bbox="189.419 628.202 191.909 641.18" font="Symbol" size="9.96" s=" " />
  </line>
  <line bbox="74.6394 640.426 246.813 651.456">
    <span bbox="74.6394 640.426 246.813 651.456" font="TimesNewRoman" size="9.96" s="DIE GRÜNEN)  . . . . . . . . . . . . . . . . . . . . ." />
  </line>
</block>
</doc>

输出

<doc>
<block>
  <line>
    <span font="TimesNewRoman,Bold" size="9.96" s="Dr. Frank-Walter Steinmeier " class-speaker="1f2e4dca-80d5-4c5e-91b6-6bd2e4a8acaf" />
    <span font="TimesNewRoman" size="9.96" s="(SPD)  . . . . . ." class-speaker="1f2e4dca-80d5-4c5e-91b6-6bd2e4a8acaf" />
  </line>
</block>
<block>
  <line>
    <span font="TimesNewRoman,Bold" size="9.96" s="Rainer Brüderle " class-speaker="eaa75d02-0ac6-4480-bcbe-f17bddfe6e81" />
    <span font="TimesNewRoman" size="9.96" s="(FDP) . . . . . . . . . . . . . . . . ." class-speaker="eaa75d02-0ac6-4480-bcbe-f17bddfe6e81" />
  </line>
</block>
<block>
  <line>
    <span font="TimesNewRoman,Bold" size="9.96" s="Sahra Wagenknecht " class-speaker="6b193f23-9b8b-4b37-9118-d8488fba25a2" />
    <span font="TimesNewRoman" size="9.96" s="(DIE LINKE)  . . . . . . ." class-speaker="6b193f23-9b8b-4b37-9118-d8488fba25a2" />
  </line>
  <line>
    <span font="TimesNewRoman,Bold" size="9.96" s="Siegfried Kauder " class-speaker="a0162e4e-1167-412a-ac11-ac13ef1aa46e" />
    <span font="TimesNewRoman" size="9.96" s="(Villingen-Schwenningen) " class-speaker="a0162e4e-1167-412a-ac11-ac13ef1aa46e" />
    <span font="Symbol" size="9.96" s=" " />
  </line>
  <line>
    <span font="TimesNewRoman" size="9.96" s="(CDU/CSU) . . . . . . . . . . . . . . . . . . . . . . . ." class-speaker="a0162e4e-1167-412a-ac11-ac13ef1aa46e" />
  </line>
  <line>
    <span font="TimesNewRoman,Bold" size="9.96" s="Jürgen Trittin " class-speaker="81fd6735-c57f-464b-a08f-7e7cb3bccfa8" />
    <span font="TimesNewRoman" size="9.96" s="(BÜNDNIS 90/" class-speaker="81fd6735-c57f-464b-a08f-7e7cb3bccfa8" />
    <span font="Symbol" size="9.96" s=" " />
  </line>
  <line>
    <span font="TimesNewRoman" size="9.96" s="DIE GRÜNEN)  . . . . . . . . . . . . . . . . . . . . ." class-speaker="81fd6735-c57f-464b-a08f-7e7cb3bccfa8" />
  </line>
</block>
</doc>

问题

要自动从简洁的模式声明转到运行代码,我正在考虑执行以下操作:

  • 使用FParsec解析模式声明到AST
  • 对AST求值

但在我做任何事之前,我想知道:

  1. 有没有人能写出一个(applicative风格的)EDSL(或其中一部分),直接在代码中用F#函数和组合子来声明这些模式,而无需借助AST?
  2. 是否有一个能够在XML上进行类似模式匹配的库?
  3. 有没有人对我的做法有任何意见?

2 个答案:

答案 0 :(得分:2)

IMO,你可以实现这样的事情的最好方法是将它分层到XLinq(LINQ-to-XML)之上。与“手动”实现所有模式匹配逻辑相比,编码更容易,更易于维护,并提供相同(如果不是更好)的性能。

请注意,您仍然可以使用DSL来定义要匹配的模式。基本上,您使用FParsec将您的DSL解析为AST,然后遍历AST并将其转换为等效的LINQ查询(请参阅System.Linq.Expressions namespace)。一旦你有了代表你的查询的LINQ表达式,你就可以将它应用于任意数量的XDocument来执行模式匹配。

您可能也有兴趣阅读Erik Meijer的论文XLinq: XML Programming Refactored (The Return Of The Monoids),该论文从功能设计的角度讨论XML编程。

答案 1 :(得分:1)

问题2:

我不太看得懂这段F#代码,不过我用Pascal写过一个XML模式匹配工具(library、online example、cli example),您也许想看一下。(我把模式称为“模板”,而且它只选择xml节点,并不修改它们。)

使用我的模板,

<doc>
<block>
  <line>
    <span>{.}</span>*
  </line>*
</block>*
</doc>

将匹配输入中的所有span。(单独使用 <span>{.}</span>* 也有同样效果。)

或另一个例子:

<doc>
<block>
  <line>
    <span font="TimesNewRoman,Bold">{@s}</span>*
  </line>*
</block>*
</doc>

将匹配包含发言人姓名的属性。