我有以下XML结构:
;; Sample document used by the snippets below: one Items element holding four
;; Item elements whose Type values are A, B, C and A.
(def xmlstr
"<ROOT>
<Items>
<Item><Type>A</Type><Note>AA</Note></Item>
<Item><Type>B</Type><Note>BB</Note></Item>
<Item><Type>C</Type><Note>CC</Note></Item>
<Item><Type>A</Type><Note>AA</Note></Item>
</Items>
</ROOT>")
我想删除所有Type为B或C的Item。结果应该是这样的:
<ROOT>
<Items>
<Item><Type>A</Type><Note>AA</Note></Item>
<Item><Type>A</Type><Note>AA</Note></Item>
</Items>
</ROOT>
我发现使用data.xml和data.xml.zip查询此类结构非常简单,例如:
;; lein try org.clojure/data.xml org.clojure/data.zip
;; Parse the XML string into an element tree.
(def xmldoc (clojure.data.xml/parse-str xmlstr))
;; Wrap the parsed tree in an XML zipper so it can be queried and navigated.
(def zipxml (clojure.zip/xml-zip xmldoc))
;; Query: the Note text of every Item under Items whose Type is "A".
(clojure.data.zip.xml/xml-> zipxml :Items :Item [:Type "A"] :Note clojure.data.zip.xml/text)
;; => ("AA" "AA")
但是我没有找到类似的声明式功能来删除或编辑子节点。
答案 0 :(得分:0)
下面的示例使用完整的命名空间名称,而不是别名。解决这个问题的一种方法是使用zipper(clojure.zip):
(defn remove-types-loc
  "Walks the zipper depth-first starting at `loc` and removes every branch
  node that has a :Type child whose first content item is a member of the
  set `types`. Returns the root of the edited tree once the walk ends."
  [types loc]
  (letfn [(listed-type? [child]
            (and (= :Type (:tag child))
                 (contains? types (first (:content child)))))
          (doomed? [z]
            (and (clojure.zip/branch? z)
                 (some listed-type? (clojure.zip/children z))))]
    (loop [z loc]
      (cond
        (clojure.zip/end? z) (clojure.zip/root z)
        ;; the walk simply continues from whatever loc `remove` hands back
        (doomed? z)          (recur (clojure.zip/remove z))
        :else                (recur (clojure.zip/next z))))))
;; Remove the elements of Type "B" or "C" via the zipper and serialize the edited tree.
(clojure.data.xml/emit-str (remove-types-loc #{"B" "C"} zipxml))
;; => emits the expected result, with the two Type A Items
下面使用核心函数也能得到相同的结果,但多了一层嵌套,并且“需要”两个函数:
(defn remove-types-in*
  "Returns `content` with its :content filtered: every child that has a :Type
  element whose first content item is a member of the set `remove-types` is
  dropped (lazily).

  Nodes that are not maps (for example whitespace text sitting between
  elements when the parser keeps whitespace) have no :content to edit and are
  returned unchanged instead of making `update-in` throw."
  [remove-types content]
  (letfn [(listed-type? [node]
            (and (= (:tag node) :Type)
                 (contains? remove-types (first (:content node)))))
          (doomed? [item]
            (some listed-type? (:content item)))]
    (if (map? content)
      (update-in content [:content] #(remove doomed? %))
      content)))
(defn remove-types-in
  "Applies `remove-types-in*` with `remove-types` to each direct child of
  `xmldoc` and returns `xmldoc` with those (lazily computed) children as its
  :content."
  [remove-types xmldoc]
  (let [prune-child (fn [child] (remove-types-in* remove-types child))]
    (update-in xmldoc [:content]
               (fn [children] (map prune-child children)))))
;; Same removal done with plain map/sequence functions on the parsed tree, then serialized.
(clojure.data.xml/emit-str (remove-types-in #{"B" "C"} xmldoc))
;; => same result as above
最后,当结构固定并且像这个一样简单时,很容易手动构造结果。但是如果源获得更多的元素或属性,这将会破坏。
;; Rebuild the document by hand: ROOT and Items are hard-coded, and each kept
;; Item is re-created from its Type and Note text only.
(clojure.data.xml/emit-str
(clojure.data.xml/sexp-as-element
[:ROOT
[:Items
(for [i (clojure.data.zip.xml/xml-> zipxml :Items :Item)
:let [t (clojure.data.zip.xml/xml1-> i :Type clojure.data.zip.xml/text)
n (clojure.data.zip.xml/xml1-> i :Note clojure.data.zip.xml/text)]
;; keep only Items whose Type text is neither "B" nor "C"
:when (not (contains? #{"B" "C"} t))]
[:Item
[:Type t]
[:Note n]])]]))
;; same as above
可能是上述更好的版本,即使项目结构发生变化也会有效:
;; Rebuild ROOT/Items by hand, but reuse the original Item elements as-is so
;; that extra children or attributes on an Item survive.
(let [listed-type? (fn [child]
                     (and (= (:tag child) :Type)
                          (contains? #{"B" "C"} (first (:content child)))))
      kept-item?   (fn [node]
                     (and (= :Item (:tag node))
                          (not (some listed-type? (:content node)))))
      ;; xml-seq yields every node of the tree; keep only the surviving Items
      kept-items   (filter kept-item? (xml-seq xmldoc))]
  (clojure.data.xml/emit-str
    (clojure.data.xml/element
      :ROOT {}
      (clojure.data.xml/element
        :Items {}
        kept-items))))
没有找到任何oneliner这样做。不确定使用org.clojure或其他库是否有更好/更可读的方法。
对于更复杂的XML编辑,XSLT或XQuery Update可以说是更“原生”的解决方案。下面是一个使用开源Saxon-HE S9API的简单粗糙(quick and dirty)的XSLT 2.0解决方案:
;; lein try net.sf.saxon/Saxon-HE "9.7.0-18"
(defn remove-types-xslt
  "Runs an XSLT 2.0 stylesheet over the XML in `xmlstr` using Saxon's s9api.
  The stylesheet copies every attribute and node unchanged, except that Item
  elements having a Type equal to any value of the `remove-types` parameter
  match an empty template and are therefore omitted.

  `remove-types` is a collection of values, each wrapped in an XdmAtomicValue.
  Returns the java.io.StringWriter that received the serialized output; call
  `str` on it to obtain the text."
  [remove-types xmlstr]
  (let [string-source (fn [^String s]
                        (javax.xml.transform.stream.StreamSource. (java.io.StringReader. s)))
        stylesheet    "<xsl:transform version='2.0' xmlns:xsl='http://www.w3.org/1999/XSL/Transform'><xsl:param name='remove-types'/><xsl:template match='@*|node()'><xsl:copy><xsl:apply-templates select='@*|node()'/></xsl:copy></xsl:template><xsl:template match='Item[Type[. = $remove-types]]'/></xsl:transform>"
        saxon         (net.sf.saxon.s9api.Processor. false)
        xslt-compiler (.newXsltCompiler saxon)
        executable    (.compile xslt-compiler (string-source stylesheet))
        input-node    (.build (.newDocumentBuilder saxon) (string-source xmlstr))
        result-writer (java.io.StringWriter.)
        serializer    (doto (net.sf.saxon.s9api.Serializer.)
                        (.setOutputWriter result-writer))]
    ;; configure the transformer and run it; the output lands in result-writer
    (doto (.load executable)
      (.setInitialContextNode input-node)
      (.setDestination serializer)
      (.setParameter (net.sf.saxon.s9api.QName. "remove-types")
                     (net.sf.saxon.s9api.XdmValue.
                       (map #(net.sf.saxon.s9api.XdmAtomicValue. %) remove-types)))
      (.transform))
    result-writer))
;; remove-types-xslt returns a StringWriter; str turns it into the output text.
(str (remove-types-xslt #{"B" "C"} xmlstr))
为了完整起见,这是一个使用XQuery Update Facility的更脏的版本。请注意,此特定示例使用Saxon-EE,因此需要付费EE许可证。
(defn remove-types-xq
  "Runs an updating XQuery over the XML in `xmlstr` using Saxon's s9api. The
  query deletes every //Items/Item that has a Type equal to any value of the
  external variable `remove-types`.

  `remove-types` is a collection of values, each wrapped in an XdmAtomicValue.
  Returns a java.io.StringWriter holding the serialization of the first
  updated document, or nil when the evaluator reports no updated documents."
  [remove-types xmlstr]
  (let [saxon       (net.sf.saxon.s9api.Processor. true)
        xq-compiler (doto (.newXQueryCompiler saxon)
                      (.setUpdatingEnabled true))
        query       (.compile xq-compiler "declare variable $remove-types as xs:string+ external;delete nodes //Items/Item[Type[. = $remove-types]]")
        ;; the input document is built with the LINKED_TREE tree model
        doc-builder (doto (.newDocumentBuilder saxon)
                      (.setTreeModel net.sf.saxon.om.TreeModel/LINKED_TREE))
        input-doc   (.build doc-builder
                            (javax.xml.transform.stream.StreamSource. (java.io.StringReader. xmlstr)))
        evaluator   (doto (.load query)
                      (.setContextItem input-doc)
                      (.setExternalVariable (net.sf.saxon.s9api.QName. "remove-types")
                                            (net.sf.saxon.s9api.XdmValue.
                                              (map #(net.sf.saxon.s9api.XdmAtomicValue. %) remove-types)))
                      (.run))
        updated-doc (first (iterator-seq (.getUpdatedDocuments evaluator)))]
    (when updated-doc
      (let [result-writer (java.io.StringWriter.)
            serializer    (doto (net.sf.saxon.s9api.Serializer.)
                            (.setOutputWriter result-writer))]
        (.writeXdmValue saxon updated-doc serializer)
        result-writer))))
;; remove-types-xq returns a StringWriter (or nil); str turns it into text.
(str (remove-types-xq #{"B" "C"} xmlstr))
抛开其他一切不谈,delete nodes //Items/Item[Type[. = $remove-types]] 这条语句非常简洁。
答案 1 :(得分:0)
The Tupelo library可以使用tupelo.forest
轻松解决此问题。你可以找到the API docs on GitHub Pages。以下是使用您的示例的测试用例。
在这里,我们加载您的xml数据并将其首先转换为enlive,然后转换为tree
使用的原生tupelo.forest
结构:
(ns tst.tupelo.forest-examples
(:use tupelo.forest tupelo.test )
(:require
[clojure.data.xml :as dx]
[clojure.java.io :as io]
[clojure.set :as cs]
[net.cgrand.enlive-html :as en-html]
[schema.core :as s]
[tupelo.core :as t]
[tupelo.string :as ts]))
(t/refer-tupelo)
; Discard any xml nodes of Type="B" or Type="C" (plus blank string nodes)
(dotest
(with-forest (new-forest)
(let [xml-str "<ROOT>
<Items>
<Item><Type>A</Type><Note>AA1</Note></Item>
<Item><Type>B</Type><Note>BB1</Note></Item>
<Item><Type>C</Type><Note>CC1</Note></Item>
<Item><Type>A</Type><Note>AA2</Note></Item>
</Items>
</ROOT>"
; Parse with xml-resource, not html-resource: the input is XML and the tree
; asserted in this example is rooted at :ROOT with case-preserved tags.
enlive-tree (->> xml-str
java.io.StringReader.
en-html/xml-resource
first)
root-hid (add-tree-enlive enlive-tree)
tree-1 (hid->tree root-hid)
hid
后缀代表“Hex ID”,它是唯一的十六进制值,其作用类似于树中节点/叶子的指针。在这个阶段,我们刚刚在林数据结构中加载了数据,创建了tree-1
,如下所示:
(is= tree-1
{:attrs {:tag :ROOT},
:kids [{:attrs {:tag :tupelo.forest/raw},
:value "\n "}
{:attrs {:tag :Items},
:kids [{:attrs {:tag :tupelo.forest/raw},
:value "\n "}
{:attrs {:tag :Item},
:kids [{:attrs {:tag :Type}, :value "A"}
{:attrs {:tag :Note}, :value "AA1"}]}
{:attrs {:tag :tupelo.forest/raw},
:value "\n "}
{:attrs {:tag :Item},
:kids [{:attrs {:tag :Type}, :value "B"}
{:attrs {:tag :Note}, :value "BB1"}]}
{:attrs {:tag :tupelo.forest/raw},
:value "\n "}
{:attrs {:tag :Item},
:kids [{:attrs {:tag :Type}, :value "C"}
{:attrs {:tag :Note}, :value "CC1"}]}
{:attrs {:tag :tupelo.forest/raw},
:value "\n "}
{:attrs {:tag :Item},
:kids [{:attrs {:tag :Type}, :value "A"}
{:attrs {:tag :Note}, :value "AA2"}]}
{:attrs {:tag :tupelo.forest/raw},
:value "\n "}]}
{:attrs {:tag :tupelo.forest/raw},
:value "\n "}]})
我们接下来用以下代码删除所有空白字符串:
blank-leaf-hid? (fn [hid] (and (leaf-hid? hid) ; ensure it is a leaf node
(let [value (hid->value hid)]
(and (string? value)
(or (zero? (count value)) ; empty string
(ts/whitespace? value)))))) ; all whitespace string
blank-leaf-hids (keep-if blank-leaf-hid? (all-hids))
>> (apply remove-hid blank-leaf-hids)
tree-2 (hid->tree root-hid)
让tree-2
看起来更整洁:
(is= tree-2
{:attrs {:tag :ROOT},
:kids [{:attrs {:tag :Items},
:kids [{:attrs {:tag :Item},
:kids [{:attrs {:tag :Type}, :value "A"}
{:attrs {:tag :Note}, :value "AA1"}]}
{:attrs {:tag :Item},
:kids [{:attrs {:tag :Type}, :value "B"}
{:attrs {:tag :Note}, :value "BB1"}]}
{:attrs {:tag :Item},
:kids [{:attrs {:tag :Type}, :value "C"}
{:attrs {:tag :Note}, :value "CC1"}]}
{:attrs {:tag :Item},
:kids [{:attrs {:tag :Type}, :value "A"}
{:attrs {:tag :Note}, :value "AA2"}]}]}]})
最终代码片段删除Type =“B”或Type =“C”节点:
type-bc-hid? (fn [hid] (pos? (count (glue
(find-leaf-hids hid [:** :Type] "B")
(find-leaf-hids hid [:** :Type] "C")))))
type-bc-hids (find-hids-with root-hid [:** :Item] type-bc-hid?)
>> (apply remove-hid type-bc-hids)
tree-3 (hid->tree root-hid)
tree-3-hiccup (hid->hiccup root-hid) ]
产生以tree
格式和hiccup
格式显示的最终结果树:
(is= tree-3
{:attrs {:tag :ROOT},
:kids
[{:attrs {:tag :Items},
:kids [{:attrs {:tag :Item},
:kids [{:attrs {:tag :Type}, :value "A"}
{:attrs {:tag :Note}, :value "AA1"}]}
{:attrs {:tag :Item},
:kids [{:attrs {:tag :Type}, :value "A"}
{:attrs {:tag :Note}, :value "AA2"}]}]}]})
(is= tree-3-hiccup
[:ROOT
[:Items
[:Item [:Type "A"] [:Note "AA1"]]
[:Item [:Type "A"] [:Note "AA2"]]]]))))
可以找到完整示例in the forest-examples
unit test。
以下是删除了额外功能的最紧凑版本:
; Compact version: load the XML into a forest, drop whitespace-only leaves,
; remove every Item that has a Type leaf of "B" or "C", then check the result
; in hiccup format.
(dotest
(with-forest (new-forest)
(let [xml-str "<ROOT>
<Items>
<Item><Type>A</Type><Note>AA1</Note></Item>
<Item><Type>B</Type><Note>BB1</Note></Item>
<Item><Type>C</Type><Note>CC1</Note></Item>
<Item><Type>A</Type><Note>AA2</Note></Item>
</Items>
</ROOT>"
; parse the string as XML and take the first node of the result
enlive-tree (->> xml-str
java.io.StringReader.
en-html/xml-resource
first)
root-hid (add-tree-enlive enlive-tree)
; predicate: the node's value is whitespace according to ts/whitespace?
blank-leaf-hid? (fn [hid] (ts/whitespace? (hid->value hid)))
; predicate: a :Type leaf with value "B" or "C" is found under hid
has-bc-leaf? (fn [hid] (or (has-child-leaf? hid [:** :Type] "B")
(has-child-leaf? hid [:** :Type] "C")))
blank-leaf-hids (keep-if blank-leaf-hid? (all-leaf-hids))
; `>>` is a throwaway binding: it exists only to run the removal at this
; point in the let, before the Items are searched
>> (apply remove-hid blank-leaf-hids)
bc-item-hids (find-hids-with root-hid [:** :Item] has-bc-leaf?)]
(apply remove-hid bc-item-hids)
; only the two Type "A" Items remain
(is= (hid->hiccup root-hid)
[:ROOT
[:Items
[:Item [:Type "A"] [:Note "AA1"]]
[:Item [:Type "A"] [:Note "AA2"]]]]))))
答案 2 :(得分:0)
Clojure标准API为操作XML和其他树结构提供了方便的函数。可以使用clojure.walk在深度优先遍历过程中删除(叶子)节点:
(require '[clojure.xml :as xml]
'[clojure.walk :as walk])
;; Sample document: one Items element holding four Item elements whose Type
;; values are A, B, C and A.
(def xmlstr
"<ROOT>
<Items>
<Item><Type>A</Type><Note>AA</Note></Item>
<Item><Type>B</Type><Note>BB</Note></Item>
<Item><Type>C</Type><Note>CC</Note></Item>
<Item><Type>A</Type><Note>AA</Note></Item>
</Items>
</ROOT>")
;; Parsed element tree of xmlstr. The bytes are encoded explicitly as UTF-8
;; instead of the platform default charset, so the result does not depend on
;; the JVM's locale settings if the document ever contains non-ASCII text.
(def xmldoc (xml/parse (java.io.ByteArrayInputStream. (.getBytes xmlstr "UTF-8"))))
(defn tag-matches
  "True when the value under :tag in `item` equals `tag`."
  [item tag]
  (-> item
      :tag
      (= tag)))
(defn content-matches
  "Concatenates `item`'s :content into one string and looks it up in the set
  built from `to-match`. Returns that string when it is a member, else nil."
  [item to-match]
  (let [wanted (set to-match)
        text   (apply str (:content item))]
    (get wanted text)))
(defn match-criteria
  "Returns a truthy value when one of `item`'s children is a :Type element
  whose content matches `to-match` (per content-matches); otherwise nil."
  [item to-match]
  (letfn [(listed-type? [child]
            (and (tag-matches child :Type)
                 (content-matches child to-match)))]
    (some listed-type? (:content item))))
(defn mk-xml-walker
  "Builds a function of one form for use with a tree walk. Given a vector that
  contains at least one :Item element, the function returns a lazy seq of its
  members without those satisfying match-criteria for `to-remove`; any other
  form is returned unchanged."
  [& to-remove]
  (letfn [(item? [node] (tag-matches node :Item))
          (doomed? [node] (match-criteria node to-remove))]
    (fn [form]
      (if-not (and (vector? form)
                   (some item? form))
        form
        (remove doomed? form)))))
;; Post-order walk of the parsed tree with a walker built for "B" and "C"; the result is emitted as XML.
(xml/emit (walk/postwalk (mk-xml-walker "B" "C") xmldoc))
对于神奇的单行,您可能还需要查看Specter,它提供了一种非常简洁的语法来处理嵌套数据结构,如XML。