克隆节点并使用xquery修改克隆

时间:2018-09-15 14:00:15

标签: xquery

我从在线词典中提取了一些HTML,希望把它转换为XML,以便最终生成用于构建BK树的单词列表。在线词典记录了变体拼写,但有时是把可有可无的元音或词尾放在括号中来表示的,例如:

<td>
    <span class="FORM">
    <span class="HDORTH">a</span>
    <span class="POS"> indef. art. </span> Also 
    <span class="ORTH">an</span>. Early forms: as subj.,
    <span class="ORTH">ane</span>, 
    <span class="ORTH">on</span>, 
    <span class="ORTH">o</span>; as obj., 
    <span class="ORTH">ane</span>, 
    <span class="ORTH">on(e</span>, 
    <span class="ORTH">o</span>, &amp; (chiefly masc.) 
    <span class="ORTH">an(n)e</span>, 
    <span class="ORTH">æn(n)e</span>, 
    <span class="ORTH">en(n)e</span>, 
    <span class="ORTH">en</span>; after prep.,chiefly 
    <span class="ORTH">ane</span>, 
    <span class="ORTH">on(e</span>, masc. also 
    <span class="ORTH">anne</span>, 
    <span class="ORTH">æn(n)e</span>, fem. also 
    <span class="ORTH">anre</span>, 
    <span class="ORTH">are</span>, 
    <span class="ORTH">hare</span>, 
    <span class="ORTH">ore</span>; gen. 
    <span class="ORTH">anes</span>, 
    <span class="ORTH">æn(n)es</span>, 
    <span class="ORTH">en(n)es</span>.</span>
</td>

我编写了以下XQuery,将HTML转换为XML,去掉不在标签内的所有文本,并根据span元素的class属性选择元素:

(:~
 : Maps dictionary <span> nodes onto entry markup according to their class attribute.
 :
 : HDORTH becomes <headword>, POS becomes <part_of_speech> and ORTH becomes
 : <variant>; each new element carries the text-node children of the source span.
 : Any other node is passed through unchanged.
 :
 : @param $nodes the nodes to convert
 : @return the converted nodes, in input order
 :)
declare function local:node-change($nodes as node()*) as node()* {
  for $item in $nodes
  let $kind := $item/@class
  let $content := $item/text()
  return
    if ($kind = "HDORTH") then element headword { $content }
    else if ($kind = "POS") then element part_of_speech { $content }
    else if ($kind = "ORTH") then element variant { $content }
    else $item
};

(: Emits one <entry> per document in the collection. The ref attribute is the
   number taken from the document's file name (characters from position 4 up to
   the first "."), and entries are ordered by that number. :)
<list>
{
  let $source := concat($collection, '?select=*.xml')
  for $doc in collection($source)
  let $fileName := tokenize(document-uri($doc), "/")[last()]
  let $ref := number(substring(substring-before($fileName, "."), 4))
  let $converted := local:node-change($doc/td/span/*)
  (: let $l := local:stripleftparen($converted) :)
  order by $ref
  return
    <entry ref="{$ref}">{$converted}</entry>
}
</list>

这将返回以下XML:

<entry ref="3">
        <headword>a</headword>
        <part_of_speech> indef. art. </part_of_speech>
        <variant>an</variant>
        <variant>ane</variant>
        <variant>on</variant>
        <variant>o</variant>
        <variant>ane</variant>
        <variant>on(e</variant>
        <variant>o</variant>
        <variant>an(n)e</variant>
        <variant>æn(n)e</variant>
        <variant>en(n)e</variant>
        <variant>en</variant>
        <variant>ane</variant>
        <variant>on(e</variant>
        <variant>anne</variant>
        <variant>æn(n)e</variant>
        <variant>anre</variant>
        <variant>are</variant>
        <variant>hare</variant>
        <variant>ore</variant>
        <variant>anes</variant>
        <variant>æn(n)es</variant>
        <variant>en(n)es</variant>
    </entry>

我现在需要做的是克隆含有括号的节点,以便修改克隆出的副本并得到下面的结果,但我不确定该怎么做。

<entry ref="3">
        <headword>a</headword>
        <part_of_speech> indef. art. </part_of_speech>
        <variant>an</variant>
        <variant>ane</variant>
        <variant>on</variant>
        <variant>o</variant>
        <variant>ane</variant>
        <variant>on</variant>
        <variant>one</variant>
        <variant>o</variant>
        <variant>ane</variant>
        <variant>anne</variant>
        <variant>æne</variant>
        <variant>ænne</variant>
        <variant>ene</variant>
        <variant>enne</variant>
        <variant>en</variant>
        <variant>ane</variant>
        <variant>on</variant>
        <variant>one</variant>
        <variant>anne</variant>
        <variant>æne</variant>
        <variant>ænne</variant>
        <variant>anre</variant>
        <variant>are</variant>
        <variant>hare</variant>
        <variant>ore</variant>
        <variant>anes</variant>
        <variant>ænes</variant>
        <variant>ænnes</variant>
        <variant>enes</variant>
        <variant>ennes</variant>
    </entry>

我知道需要使用substring、substring-before或substring-after来实际修改节点,但在真正的克隆步骤上遇到了问题。copy在for / return循环中不起作用,我在网上找到的所有资料要么只是建议复制节点,要么是在讨论去重(去重我之后也需要做,但我想先弄清楚如何完全按照我的意图进行操作)。如何复制节点、修改副本并输出结果,以便得到所需的内容?

1 个答案:

答案 0 :(得分:1)

我不清楚规则是什么。但是在我看来,您应该可以在节点更改功能中一次完成所有操作。

我认为您可以按照以下方式做些事情:

      (:~
       : Sketch of a one-pass conversion: each input node yields a primary result
       : followed by a placeholder slot for a second variant.
       :
       : HDORTH becomes <headword> and POS becomes <part_of_speech>. Every ORTH span
       : becomes a <variant> whose text has "(" and ")" removed; nodes with any other
       : class are passed through unchanged.
       :
       : @param $nodes the nodes to convert
       : @return for each input node, the primary result followed by a <varient/> placeholder
       :)
      declare function local:node-change($nodes as node()*) as node()* {
      for $span in $nodes
      let $primary :=
        if ($span/@class = "HDORTH") then <headword>{$span/text()}</headword>
        else if ($span/@class = "POS") then <part_of_speech>{$span/text()}</part_of_speech>
        (: translate() leaves text without parentheses unchanged, so plain spellings
           are wrapped in <variant> too instead of falling through as raw <span>s :)
        else if ($span/@class = "ORTH") then <variant>{translate($span/text(),'()','')}</variant>
        else $span
      (: Implement some if here to get the other variant if needed :)
      let $secondary := <varient/>
      return
        ($primary, $secondary)
      } ;

如果有三个变体(不确定),则遵循相同的模式。当然,任何一个附加变体的else分支都可以返回一个空元素(即 else <empty/>),然后在最终结果中去除所有 <empty/> 元素。

大概是这样(猜规则):

    xquery version "3.0";
    (:~
     : Converts dictionary <span> nodes into entry markup, expanding an optional
     : parenthesised part of an ORTH spelling into separate variants.
     :
     : Every input node yields exactly three items; unused slots hold an <empty/>
     : placeholder that the caller is expected to filter out.
     :   - HDORTH / POS: (<headword> or <part_of_speech>, <empty/>, <empty/>)
     :   - ORTH with "(" and ")": (text without the bracketed part, <empty/>, text with both parens removed)
     :   - ORTH with "(" only: (text before "(", text with "(" removed, <empty/>)
     :   - ORTH without parens: (<variant> of the text, <empty/>, <empty/>)
     :   - anything else, including ORTH with only ")": (the node itself, <empty/>, <empty/>)
     :
     : @param $nodes the nodes to convert
     : @return three items per input node, in input order
     :)
    declare function local:node-change($nodes as node()*) as node()* {
      for $span in $nodes
      let $kind := $span/@class
      return
        if ($kind = "HDORTH") then
          (<headword>{$span/text()}</headword>, <empty/>, <empty/>)
        else if ($kind = "POS") then
          (<part_of_speech>{$span/text()}</part_of_speech>, <empty/>, <empty/>)
        else if ($kind = "ORTH") then (
          let $txt := $span/text()
          let $hasOpen := contains($txt, '(')
          let $hasClose := contains($txt, ')')
          return
            if ($hasOpen and $hasClose) then
              (<variant>{concat(substring-before($txt,'('),substring-after($txt,')'))}</variant>,
               <empty/>,
               <variant>{translate($txt,'()','')}</variant>)
            else if ($hasOpen) then
              (<variant>{substring-before($txt,'(')}</variant>,
               <variant>{translate($txt,'(','')}</variant>,
               <empty/>)
            else if ($hasClose) then
              (: a lone ")" matches none of the variant rules; the span is passed through :)
              ($span, <empty/>, <empty/>)
            else
              (<variant>{$txt}</variant>, <empty/>, <empty/>)
        )
        else
          ($span, <empty/>, <empty/>)
    };
    (: Sample dictionary cell used to exercise local:node-change. The children of
       the outer FORM span are converted and the <empty/> placeholders are dropped
       from the result. :)
    let $cell := <td>
        <span class="FORM">
        <span class="HDORTH">a</span>
        <span class="POS"> indef. art. </span> Also 
        <span class="ORTH">an</span>. Early forms: as subj.,
        <span class="ORTH">ane</span>, 
        <span class="ORTH">on</span>, 
        <span class="ORTH">o</span>; as obj., 
        <span class="ORTH">ane</span>, 
        <span class="ORTH">on(e</span>, 
        <span class="ORTH">o</span>, &amp; (chiefly masc.) 
        <span class="ORTH">an(n)e</span>, 
        <span class="ORTH">æn(n)e</span>, 
        <span class="ORTH">en(n)e</span>, 
        <span class="ORTH">en</span>; after prep.,chiefly 
        <span class="ORTH">ane</span>, 
        <span class="ORTH">on(e</span>, masc. also 
        <span class="ORTH">anne</span>, 
        <span class="ORTH">æn(n)e</span>, fem. also 
        <span class="ORTH">anre</span>, 
        <span class="ORTH">are</span>, 
        <span class="ORTH">hare</span>, 
        <span class="ORTH">ore</span>; gen. 
        <span class="ORTH">anes</span>, 
        <span class="ORTH">æn(n)es</span>, 
        <span class="ORTH">en(n)es</span>.</span>
    </td>
    return
      local:node-change($cell/span/*)[local-name() ne 'empty']

返回此:

<headword>a</headword>
<part_of_speech> indef. art. </part_of_speech>
<variant>an</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>o</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>one</variant>
<variant>o</variant>
<variant>ane</variant>
<variant>anne</variant>
<variant>æne</variant>
<variant>ænne</variant>
<variant>ene</variant>
<variant>enne</variant>
<variant>en</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>one</variant>
<variant>anne</variant>
<variant>æne</variant>
<variant>ænne</variant>
<variant>anre</variant>
<variant>are</variant>
<variant>hare</variant>
<variant>ore</variant>
<variant>anes</variant>
<variant>ænes</variant>
<variant>ænnes</variant>
<variant>enes</variant>
<variant>ennes</variant>
相关问题