我从一部在线词典中抓取了一些 HTML,希望把它转换为 XML,最终整理成用于构建 BK 树的单词列表。该词典会记录变体拼写,但有时是把可有可无的元音或词尾放在括号里来表示的,例如:
<td>
<span class="FORM">
<span class="HDORTH">a</span>
<span class="POS"> indef. art. </span> Also
<span class="ORTH">an</span>. Early forms: as subj.,
<span class="ORTH">ane</span>,
<span class="ORTH">on</span>,
<span class="ORTH">o</span>; as obj.,
<span class="ORTH">ane</span>,
<span class="ORTH">on(e</span>,
<span class="ORTH">o</span>, &amp; (chiefly masc.)
<span class="ORTH">an(n)e</span>,
<span class="ORTH">æn(n)e</span>,
<span class="ORTH">en(n)e</span>,
<span class="ORTH">en</span>; after prep.,chiefly
<span class="ORTH">ane</span>,
<span class="ORTH">on(e</span>, masc. also
<span class="ORTH">anne</span>,
<span class="ORTH">æn(n)e</span>, fem. also
<span class="ORTH">anre</span>,
<span class="ORTH">are</span>,
<span class="ORTH">hare</span>,
<span class="ORTH">ore</span>; gen.
<span class="ORTH">anes</span>,
<span class="ORTH">æn(n)es</span>,
<span class="ORTH">en(n)es</span>.</span>
</td>
我编写了以下 XQuery,将 HTML 转换为 XML:丢弃不在标签内的所有内容,并根据各个 span 的 class 属性选择元素:
(:~
 : Renames dictionary markup nodes according to their @class attribute.
 :
 : HDORTH becomes <headword>, POS becomes <part_of_speech> and ORTH becomes
 : <variant>; each new element receives the text-node children of the
 : source node. Any node with a different (or no) class is returned as is.
 :
 : @param $nodes nodes to convert, processed in sequence order
 : @return one node per input node, in the same order
 :)
declare function local:node-change($nodes as node()*) as node()* {
  for $item in $nodes
  let $cls := $item/@class
  (: Output element name for the recognised classes, empty otherwise. :)
  let $target :=
    if ($cls = "HDORTH") then "headword"
    else if ($cls = "POS") then "part_of_speech"
    else if ($cls = "ORTH") then "variant"
    else ()
  return
    if (exists($target))
    then element { $target } { $item/text() }
    else $item
};
(: Builds a <list> holding one <entry> per document in the collection.
   The collection URI is the incoming $collection value with
   '?select=*.xml' appended. Each entry's reference number comes from the
   document's file name: the part before the first ".", read from its 4th
   character onward and converted with number(). Entries are sorted by it. :)
<list>
{
  let $collection := concat($collection, '?select=*.xml')
  for $doc in collection($collection)
  let $file := tokenize(document-uri($doc), "/")[last()]
  let $ref := number(substring(substring-before($file, "."), 4))
  let $converted := local:node-change($doc/td/span/*)
  (: disabled experiment: let $l := local:stripleftparen($converted) :)
  order by $ref
  return
    <entry ref="{$ref}">{$converted}</entry>
}
</list>
这将返回以下XML:
<entry ref="3">
<headword>a</headword>
<part_of_speech> indef. art. </part_of_speech>
<variant>an</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>o</variant>
<variant>ane</variant>
<variant>on(e</variant>
<variant>o</variant>
<variant>an(n)e</variant>
<variant>æn(n)e</variant>
<variant>en(n)e</variant>
<variant>en</variant>
<variant>ane</variant>
<variant>on(e</variant>
<variant>anne</variant>
<variant>æn(n)e</variant>
<variant>anre</variant>
<variant>are</variant>
<variant>hare</variant>
<variant>ore</variant>
<variant>anes</variant>
<variant>æn(n)es</variant>
<variant>en(n)es</variant>
</entry>
我现在需要做的是克隆带有括号的节点,这样我就可以修改克隆出的副本,得到下面的结果,但我不确定该怎么做。
<entry ref="3">
<headword>a</headword>
<part_of_speech> indef. art. </part_of_speech>
<variant>an</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>o</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>one</variant>
<variant>o</variant>
<variant>ane</variant>
<variant>anne</variant>
<variant>æne</variant>
<variant>ænne</variant>
<variant>ene</variant>
<variant>enne</variant>
<variant>en</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>one</variant>
<variant>anne</variant>
<variant>æne</variant>
<variant>ænne</variant>
<variant>anre</variant>
<variant>are</variant>
<variant>hare</variant>
<variant>ore</variant>
<variant>anes</variant>
<variant>ænes</variant>
<variant>ænnes</variant>
<variant>enes</variant>
<variant>ennes</variant>
</entry>
我知道我需要使用 substring、substring-before 或 substring-after 来实际修改节点,但是在真正的克隆步骤上遇到了问题。copy 在 for / return 循环中不起作用,我在网上找到的所有资料要么建议复制节点,要么讨论的是去重(去重我之后确实也需要做,但我想先弄清楚怎样完全按照我的意愿完成这一步)。如何复制节点、修改副本并输出结果,以便得到我想要的内容?
答案 0 :(得分:1)
我不清楚规则是什么。但是在我看来,您应该可以在节点更改功能中一次完成所有操作。
我认为您可以按照以下方式做些事情:
(:~
 : Sketch of a converter that can emit more than one element per input node.
 :
 : Every input node yields two items: the converted element and a slot for an
 : additional spelling. The slot is still a stub and always holds <empty/>,
 : a placeholder the caller is expected to filter out of the result.
 :
 : HDORTH becomes <headword>, POS becomes <part_of_speech> and ORTH becomes
 : <variant>; nodes with any other class are passed through unchanged.
 :
 : @param $nodes nodes to convert, processed in sequence order
 : @return for each input node, its converted form followed by <empty/>
 :)
declare function local:node-change($nodes as node()*) as node()* {
  for $span in $nodes
  let $variant1 :=
    if ($span/@class = "HDORTH") then <headword>{$span/text()}</headword>
    else if ($span/@class = "POS") then <part_of_speech>{$span/text()}</part_of_speech>
    else if ($span/@class = "ORTH" and contains($span/text(), ')')) then
      (: Spelling with the optional letters included: drop both parentheses. :)
      <variant>{translate($span/text(), '()', '')}</variant>
    else if ($span/@class = "ORTH") then
      (: ORTH spans without ")" are variants too; without this branch they
         would be returned as raw <span> elements. An unclosed "(" is kept
         verbatim here and is not expanded by this sketch. :)
      <variant>{$span/text()}</variant>
    else $span
  (: Implement some if here to get the other variant if needed :)
  let $variant2 := <empty/>
  return
    ($variant1, $variant2)
};
如果有三个变体(不确定),则遵循相同的模式。当然,任何一个附加变体的 else 分支都可以返回一个空元素,您可以在最后把它去除(即 else <empty/>)。
大概是这样(猜规则):
xquery version "3.0";
(:~
 : Expands a spelling that marks optional letters with parentheses into all
 : the spellings it stands for.
 :
 : Each "(...)" group doubles the result: once without and once with the
 : enclosed letters, in that order. A "(" with no later ")" makes the whole
 : remainder optional, with any further "(" in it removed. A string without
 : "(" is returned unchanged.
 :
 : Examples: "on(e" gives ("on", "one"); "an(n)e" gives ("ane", "anne");
 : "a(n)n(e)" gives ("an", "ann", "ane", "anne").
 :
 : @param $form spelling as written in the dictionary
 : @return every expanded spelling; always at least one string
 :)
declare function local:expand-optional($form as xs:string) as xs:string* {
  if (not(contains($form, '('))) then
    $form
  else
    let $stem := substring-before($form, '(')
    let $after := substring-after($form, '(')
    let $closed := contains($after, ')')
    let $optional :=
      if ($closed) then substring-before($after, ')')
      else translate($after, '(', '')
    let $tail := if ($closed) then substring-after($after, ')') else ''
    (: Recurse on the tail so that later groups are expanded as well. :)
    for $rest in local:expand-optional($tail)
    return (concat($stem, $rest), concat($stem, $optional, $rest))
};

(:~
 : Converts dictionary markup nodes into output elements, chosen by @class.
 :
 : HDORTH becomes <headword> and POS becomes <part_of_speech>, each holding
 : the text-node children of the source node. ORTH becomes one <variant> per
 : spelling produced by local:expand-optional, so "an(n)e" yields
 : <variant>ane</variant><variant>anne</variant>. Nodes with any other class
 : are returned unchanged. No placeholder elements are emitted.
 :
 : @param $nodes nodes to convert, processed in sequence order
 : @return the converted nodes, in input order
 :)
declare function local:node-change($nodes as node()*) as node()* {
  for $span in $nodes
  return
    if ($span/@class = "HDORTH") then <headword>{$span/text()}</headword>
    else if ($span/@class = "POS") then <part_of_speech>{$span/text()}</part_of_speech>
    else if ($span/@class = "ORTH") then (
      (: string-join tolerates zero or several text nodes, where passing
         $span/text() straight to contains() would raise a type error. :)
      for $spelling in local:expand-optional(string-join($span/text(), ''))
      return <variant>{$spelling}</variant>
    )
    else $span
};
(: Demo driver: runs local:node-change over an inline copy of one dictionary
   cell and drops the <empty/> placeholders from the result.
   The ampersand in the sample text is written as &amp; because a bare "&"
   inside a direct element constructor is a syntax error. :)
let $cell := <td>
  <span class="FORM">
    <span class="HDORTH">a</span>
    <span class="POS"> indef. art. </span> Also
    <span class="ORTH">an</span>. Early forms: as subj.,
    <span class="ORTH">ane</span>,
    <span class="ORTH">on</span>,
    <span class="ORTH">o</span>; as obj.,
    <span class="ORTH">ane</span>,
    <span class="ORTH">on(e</span>,
    <span class="ORTH">o</span>, &amp; (chiefly masc.)
    <span class="ORTH">an(n)e</span>,
    <span class="ORTH">æn(n)e</span>,
    <span class="ORTH">en(n)e</span>,
    <span class="ORTH">en</span>; after prep.,chiefly
    <span class="ORTH">ane</span>,
    <span class="ORTH">on(e</span>, masc. also
    <span class="ORTH">anne</span>,
    <span class="ORTH">æn(n)e</span>, fem. also
    <span class="ORTH">anre</span>,
    <span class="ORTH">are</span>,
    <span class="ORTH">hare</span>,
    <span class="ORTH">ore</span>; gen.
    <span class="ORTH">anes</span>,
    <span class="ORTH">æn(n)es</span>,
    <span class="ORTH">en(n)es</span>.</span>
</td>
(: Only the element children of the FORM span are converted; the text
   between them is not selected. :)
let $s := $cell/span/*
let $c := local:node-change($s)
return
  $c[not(local-name()='empty')]
返回此:
<headword>a</headword>
<part_of_speech> indef. art. </part_of_speech>
<variant>an</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>o</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>one</variant>
<variant>o</variant>
<variant>ane</variant>
<variant>anne</variant>
<variant>æne</variant>
<variant>ænne</variant>
<variant>ene</variant>
<variant>enne</variant>
<variant>en</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>one</variant>
<variant>anne</variant>
<variant>æne</variant>
<variant>ænne</variant>
<variant>anre</variant>
<variant>are</variant>
<variant>hare</variant>
<variant>ore</variant>
<variant>anes</variant>
<variant>ænes</variant>
<variant>ænnes</variant>
<variant>enes</variant>
<variant>ennes</variant>