使用Clojure删除特定的XML节点

时间:2017-06-07 20:15:48

标签: xml clojure

我有以下XML结构:

;; Sample input document: ROOT/Items holding four Item elements whose
;; Type values are A, B, C and A, each with a matching Note.
(def xmlstr
"<ROOT>
  <Items>
    <Item><Type>A</Type><Note>AA</Note></Item>
    <Item><Type>B</Type><Note>BB</Note></Item>
    <Item><Type>C</Type><Note>CC</Note></Item>
    <Item><Type>A</Type><Note>AA</Note></Item>
  </Items>
</ROOT>")

我想删除所有Type为B或C的Item。结果应该是这样的:

<ROOT>
  <Items>
    <Item><Type>A</Type><Note>AA</Note></Item>
    <Item><Type>A</Type><Note>AA</Note></Item>
  </Items>
</ROOT>

我发现使用data.xml和data.xml.zip查询此类结构非常简单,例如:

;; lein try org.clojure/data.xml org.clojure/data.zip
;; Parse the sample XML string, then wrap the parsed tree in an XML zipper
;; so it can be queried with clojure.data.zip.xml.
(def xmldoc (clojure.data.xml/parse-str xmlstr))
(def zipxml (clojure.zip/xml-zip xmldoc))

;; Query: Items -> Item elements having a Type child with text "A" -> the text of their Note.
(clojure.data.zip.xml/xml-> zipxml :Items :Item [:Type "A"] :Note clojure.data.zip.xml/text)
;; => ("AA" "AA")

但是我没有找到类似的声明式功能来删除/编辑子节点。

3 个答案:

答案 0 :(得分:0)

下面的示例使用完整的命名空间,而不是别名。解决这个问题的一种方法是使用zipper(clojure.zip):

(defn remove-types-loc
  "Walks the zipper starting at `loc` in depth-first order and removes every
  branch node that has a direct :Type child whose first content item is a
  member of `types` (a set). Returns the root node of the edited tree."
  [types loc]
  (letfn [(unwanted-type? [node]
            ;; a :Type element whose first content item is one of `types`
            (and (= (:tag node) :Type)
                 (contains? types (first (:content node)))))
          (doomed? [cursor]
            (and (clojure.zip/branch? cursor)
                 (some unwanted-type? (clojure.zip/children cursor))))]
    (loop [cursor loc]
      (cond
        (clojure.zip/end? cursor) (clojure.zip/root cursor)
        ;; zip/remove hands back the location that precedes the removed node
        ;; in the walk, so the traversal simply continues from there.
        (doomed? cursor)          (recur (clojure.zip/remove cursor))
        :else                     (recur (clojure.zip/next cursor))))))

;; Run the zipper-based removal for types B and C on the zipper built from
;; the sample document, then emit the edited tree as an XML string.
(clojure.data.xml/emit-str (remove-types-loc #{"B" "C"} zipxml))
;; => emits the expected result, with the two Type A Items

以下使用核心函数给出了相同的结果,但由于多了一层嵌套,因此“需要”两个函数:

(defn remove-types-in*
  "Returns `content` with its :content replaced by a lazy seq of the original
  children, minus every child that has a :Type element whose first content
  item is a member of `remove-types` (a set)."
  [remove-types content]
  (letfn [(unwanted-type? [child]
            (and (= (:tag child) :Type)
                 (contains? remove-types (first (:content child)))))
          (unwanted-item? [item]
            (some unwanted-type? (:content item)))]
    (assoc content :content (remove unwanted-item? (:content content)))))

(defn remove-types-in
  "Applies `remove-types-in*` with `remove-types` to every direct child of
  `xmldoc`, returning `xmldoc` with its :content replaced by the results."
  [remove-types xmldoc]
  (let [prune-child (fn [child] (remove-types-in* remove-types child))]
    (assoc xmldoc :content (map prune-child (:content xmldoc)))))

;; Same removal, this time on the parsed document itself (no zipper),
;; emitted as an XML string.
(clojure.data.xml/emit-str (remove-types-in #{"B" "C"} xmldoc))
;; => same result as above

最后,当结构固定并且像这个一样简单时,很容易手动构造结果。但是如果源获得更多的元素或属性,这将会破坏。

;; Rebuild the document by hand: read the Type and Note text of every Item,
;; skip the Items whose Type is B or C, and re-create the rest as
;; hiccup-style vectors under a fresh ROOT/Items skeleton.
(let [excluded  #{"B" "C"}
      item-locs (clojure.data.zip.xml/xml-> zipxml :Items :Item)
      kept      (for [item-loc item-locs
                      :let [type-text (clojure.data.zip.xml/xml1-> item-loc :Type clojure.data.zip.xml/text)
                            note-text (clojure.data.zip.xml/xml1-> item-loc :Note clojure.data.zip.xml/text)]
                      :when (not (contains? excluded type-text))]
                  [:Item
                   [:Type type-text]
                   [:Note note-text]])]
  (clojure.data.xml/emit-str
   (clojure.data.xml/sexp-as-element
    [:ROOT
     [:Items kept]])))
;; same as above

可能是上述更好的版本,即使项目结构发生变化也会有效:

;; Rebuild only the ROOT/Items skeleton and reuse the surviving Item nodes
;; unchanged, so extra children or attributes on an Item are carried over.
(let [unwanted-type? (fn [child]
                       (and (= (:tag child) :Type)
                            (contains? #{"B" "C"} (first (:content child)))))
      keep-item?     (fn [node]
                       (and (= :Item (:tag node))
                            (not (some unwanted-type? (:content node)))))]
  (clojure.data.xml/emit-str
   (clojure.data.xml/element
    :ROOT {}
    (clojure.data.xml/element
     :Items {}
     (filter keep-item? (xml-seq xmldoc))))))

没有找到任何oneliner这样做。不确定使用org.clojure或其他库是否有更好/更可读的方法。

对于更复杂的XML编辑,XSLT或XQuery Update可以说是更“原生”的解决方案。这是一个使用开源Saxon-HE S9API的快速而粗糙的XSLT 2.0解决方案:

;; lein try net.sf.saxon/Saxon-HE "9.7.0-18"
;; Removes Items from the XML string `xmlstr` by running an XSLT 2.0
;; stylesheet through Saxon's s9api. The stylesheet is an identity copy
;; (match='@*|node()') plus an empty template for
;; Item[Type[. = $remove-types]], so matching Items produce no output.
;; `remove-types` is a collection of strings passed in as the stylesheet
;; parameter `remove-types`.
;; Returns the java.io.StringWriter the serializer wrote to; call `str` on
;; it to get the resulting XML text.
(defn remove-types-xslt [remove-types xmlstr]
  (let [processor (net.sf.saxon.s9api.Processor. false)
         compiler (.newXsltCompiler processor)
         ;; compiled stylesheet: identity transform + empty template for the unwanted Items
         exp (.compile compiler (javax.xml.transform.stream.StreamSource. (java.io.StringReader. "<xsl:transform version='2.0' xmlns:xsl='http://www.w3.org/1999/XSL/Transform'><xsl:param name='remove-types'/><xsl:template match='@*|node()'><xsl:copy><xsl:apply-templates select='@*|node()'/></xsl:copy></xsl:template><xsl:template match='Item[Type[. = $remove-types]]'/></xsl:transform>")))
         ;; source document built from the input string
         src (.build (.newDocumentBuilder processor) (javax.xml.transform.stream.StreamSource. (java.io.StringReader. xmlstr)))
         sw (java.io.StringWriter.)
         ;; serializer that writes the transformation result into `sw`
         out (doto (net.sf.saxon.s9api.Serializer.) (.setOutputWriter sw))
         ;; `t` is never read: the binding exists so that the doto chain runs,
         ;; ending with `.transform`, before `sw` is returned.
         t (doto (.load exp) (.setInitialContextNode src) (.setDestination out) (.setParameter (net.sf.saxon.s9api.QName. "remove-types") (net.sf.saxon.s9api.XdmValue. (for [remove-type remove-types] (net.sf.saxon.s9api.XdmAtomicValue. remove-type)))) (.transform))]
    sw))
;; Usage: remove types B and C from the sample string and read the result as text.
(str (remove-types-xslt #{"B" "C"} xmlstr))

为了完整起见,这是一个使用XQuery Update Facility的更脏的版本。请注意,此特定示例使用Saxon-EE,因此需要付费EE许可证。

;; Removes Items from the XML string `xmlstr` with an XQuery Update
;; expression (`delete nodes //Items/Item[Type[. = $remove-types]]`) run
;; through Saxon's s9api with updating enabled on the compiler.
;; `remove-types` is a collection of strings bound to the external variable
;; `$remove-types`.
;; Returns a java.io.StringWriter holding the serialized first updated
;; document, or nil when the evaluator reports no updated documents.
(defn remove-types-xq [remove-types xmlstr]
  (let [processor (net.sf.saxon.s9api.Processor. true)
        compiler (doto (.newXQueryCompiler processor) (.setUpdatingEnabled true))
        exp (.compile compiler "declare variable $remove-types as xs:string+ external;delete nodes //Items/Item[Type[. = $remove-types]]")
        ;; the source document is built with the LINKED_TREE tree model
        src (.build (doto (.newDocumentBuilder processor) (.setTreeModel net.sf.saxon.om.TreeModel/LINKED_TREE)) (javax.xml.transform.stream.StreamSource. (java.io.StringReader. xmlstr)))
        ;; configure the evaluator (context item + external variable) and run the update
        e (doto (.load exp) (.setContextItem src) (.setExternalVariable (net.sf.saxon.s9api.QName. "remove-types") (net.sf.saxon.s9api.XdmValue. (for [remove-type remove-types] (net.sf.saxon.s9api.XdmAtomicValue. remove-type)))) (.run))]
    ;; only the first updated document is serialized
    (when-let [res (first (iterator-seq (.getUpdatedDocuments e)))]
      (let [sw (java.io.StringWriter.)
            out (doto (net.sf.saxon.s9api.Serializer.) (.setOutputWriter sw))]
        (.writeXdmValue processor res out)
        sw))))
;; Usage: remove types B and C from the sample string and read the result as text.
(str (remove-types-xq #{"B" "C"} xmlstr))

撇开其他不谈,delete nodes //Items/Item[Type[. = $remove-types]] 这一句非常简洁。

答案 1 :(得分:0)

Tupelo 库可以使用 tupelo.forest 轻松解决此问题。API 文档可以在 GitHub Pages 上找到。以下是使用您的示例的测试用例。

在这里,我们加载您的xml数据,先将其转换为enlive格式,然后再转换为tupelo.forest使用的原生tree结构:

(ns tst.tupelo.forest-examples
  (:use tupelo.forest tupelo.test )
  (:require
    [clojure.data.xml :as dx]
    [clojure.java.io :as io]
    [clojure.set :as cs]
    [net.cgrand.enlive-html :as en-html]
    [schema.core :as s]
    [tupelo.core :as t]
    [tupelo.string :as ts]))
(t/refer-tupelo)

; Discard any xml nodes of Type="B" or Type="C" (plus blank string nodes)
(dotest
  (with-forest (new-forest)
    (let [xml-str         "<ROOT>
                            <Items>
                              <Item><Type>A</Type><Note>AA1</Note></Item>
                              <Item><Type>B</Type><Note>BB1</Note></Item>
                              <Item><Type>C</Type><Note>CC1</Note></Item>
                              <Item><Type>A</Type><Note>AA2</Note></Item>
                            </Items>
                          </ROOT>"
          ; Use the XML parser entry point: the input is XML, and the tree
          ; asserted afterwards expects :ROOT at the root with its tag case intact.
          enlive-tree     (->> xml-str
                            java.io.StringReader.
                            en-html/xml-resource
                            first)
          ; load the enlive tree into the forest; root-hid identifies its root node
          root-hid        (add-tree-enlive enlive-tree)
          tree-1          (hid->tree root-hid)

hid后缀代表“Hex ID”,它是唯一的十六进制值,其作用类似于树中节点/叶子的指针。在这个阶段,我们刚刚在林数据结构中加载了数据,创建了tree-1,如下所示:

 (is= tree-1
   {:attrs {:tag :ROOT},
    :kids  [{:attrs {:tag :tupelo.forest/raw},
             :value "\n                            "}
            {:attrs {:tag :Items},
             :kids  [{:attrs {:tag :tupelo.forest/raw},
                      :value "\n                              "}
                     {:attrs {:tag :Item},
                      :kids  [{:attrs {:tag :Type}, :value "A"}
                              {:attrs {:tag :Note}, :value "AA1"}]}
                     {:attrs {:tag :tupelo.forest/raw},
                      :value "\n                              "}
                     {:attrs {:tag :Item},
                      :kids  [{:attrs {:tag :Type}, :value "B"}
                              {:attrs {:tag :Note}, :value "BB1"}]}
                     {:attrs {:tag :tupelo.forest/raw},
                      :value "\n                              "}
                     {:attrs {:tag :Item},
                      :kids  [{:attrs {:tag :Type}, :value "C"}
                              {:attrs {:tag :Note}, :value "CC1"}]}
                     {:attrs {:tag :tupelo.forest/raw},
                      :value "\n                              "}
                     {:attrs {:tag :Item},
                      :kids  [{:attrs {:tag :Type}, :value "A"}
                              {:attrs {:tag :Note}, :value "AA2"}]}
                     {:attrs {:tag :tupelo.forest/raw},
                      :value "\n                            "}]}
            {:attrs {:tag :tupelo.forest/raw},
             :value "\n                          "}]})

我们接下来用以下代码删除所有空白字符串:

blank-leaf-hid? (fn [hid] (and (leaf-hid? hid) ; ensure it is a leaf node
                            (let [value (hid->value hid)]
                              (and (string? value)
                                (or (zero? (count value)) ; empty string
                                  (ts/whitespace? value)))))) ; all whitespace string

blank-leaf-hids (keep-if blank-leaf-hid? (all-hids))
>>              (apply remove-hid blank-leaf-hids)
tree-2          (hid->tree root-hid)

tree-2看起来更整洁:

(is= tree-2
  {:attrs {:tag :ROOT},
   :kids  [{:attrs {:tag :Items},
            :kids  [{:attrs {:tag :Item},
                     :kids  [{:attrs {:tag :Type}, :value "A"}
                             {:attrs {:tag :Note}, :value "AA1"}]}
                    {:attrs {:tag :Item},
                     :kids  [{:attrs {:tag :Type}, :value "B"}
                             {:attrs {:tag :Note}, :value "BB1"}]}
                    {:attrs {:tag :Item},
                     :kids  [{:attrs {:tag :Type}, :value "C"}
                             {:attrs {:tag :Note}, :value "CC1"}]}
                    {:attrs {:tag :Item},
                     :kids  [{:attrs {:tag :Type}, :value "A"}
                             {:attrs {:tag :Note}, :value "AA2"}]}]}]})

最终代码片段删除Type =“B”或Type =“C”节点:

type-bc-hid?    (fn [hid] (pos? (count (glue
                            (find-leaf-hids hid [:** :Type] "B")
                            (find-leaf-hids hid [:** :Type] "C")))))

type-bc-hids    (find-hids-with root-hid [:** :Item] type-bc-hid?)
>>              (apply remove-hid type-bc-hids)
tree-3          (hid->tree root-hid)
tree-3-hiccup   (hid->hiccup root-hid) ]

产生以tree格式和hiccup格式显示的最终结果树:

(is= tree-3
  {:attrs {:tag :ROOT},
   :kids
          [{:attrs {:tag :Items},
            :kids  [{:attrs {:tag :Item},
                     :kids  [{:attrs {:tag :Type}, :value "A"}
                             {:attrs {:tag :Note}, :value "AA1"}]}
                    {:attrs {:tag :Item},
                     :kids  [{:attrs {:tag :Type}, :value "A"}
                             {:attrs {:tag :Note}, :value "AA2"}]}]}]})
(is= tree-3-hiccup
  [:ROOT
   [:Items
    [:Item [:Type "A"] [:Note "AA1"]]
    [:Item [:Type "A"] [:Note "AA2"]]]]))))

完整示例可以在 forest-examples 单元测试中找到。

更新

以下是删除了额外功能的最紧凑版本:

; Compact version: load the XML into a fresh forest, drop the whitespace-only
; leaves, drop every :Item that has a :Type leaf of "B" or "C", then check the
; remaining tree in hiccup form.
(dotest
  (with-forest (new-forest)
    (let [xml-str         "<ROOT>
                            <Items>
                              <Item><Type>A</Type><Note>AA1</Note></Item>
                              <Item><Type>B</Type><Note>BB1</Note></Item>
                              <Item><Type>C</Type><Note>CC1</Note></Item>
                              <Item><Type>A</Type><Note>AA2</Note></Item>
                            </Items>
                          </ROOT>"
          ; parse the string with enlive's XML reader and take the first resulting node
          enlive-tree     (->> xml-str
                            java.io.StringReader.
                            en-html/xml-resource
                            first)
          root-hid        (add-tree-enlive enlive-tree)
          ; predicate: the value stored at this hid is whitespace only
          blank-leaf-hid? (fn [hid] (ts/whitespace? (hid->value hid)))
          ; predicate: a :Type leaf with value "B" or "C" is found under this hid
          has-bc-leaf?    (fn [hid] (or (has-child-leaf? hid [:** :Type] "B")
                                        (has-child-leaf? hid [:** :Type] "C")))
          blank-leaf-hids (keep-if blank-leaf-hid? (all-leaf-hids))
          ; `>>` is a throwaway binding that is never read: the call runs here,
          ; before the :Item search on the next line (presumably for its side
          ; effect of removing the blank leaves from the forest).
          >>              (apply remove-hid blank-leaf-hids)
          bc-item-hids    (find-hids-with root-hid [:** :Item] has-bc-leaf?)]
      (apply remove-hid bc-item-hids)
      ; only the two Type "A" items are expected to remain
      (is= (hid->hiccup root-hid)
        [:ROOT
         [:Items
          [:Item [:Type "A"] [:Note "AA1"]]
          [:Item [:Type "A"] [:Note "AA2"]]]]))))

答案 2 :(得分:0)

Clojure标准API为操作XML和其他树结构提供了方便的函数。可以使用clojure.walk,通过深度优先遍历来删除(叶子)节点:

(require '[clojure.xml :as xml]
         '[clojure.walk :as walk])

;; Sample input document: ROOT/Items holding four Item elements whose
;; Type values are A, B, C and A, each with a matching Note.
(def xmlstr
"<ROOT>
  <Items>
    <Item><Type>A</Type><Note>AA</Note></Item>
    <Item><Type>B</Type><Note>BB</Note></Item>
    <Item><Type>C</Type><Note>CC</Note></Item>
    <Item><Type>A</Type><Note>AA</Note></Item>
  </Items>
</ROOT>")

;; Parse the sample string with clojure.xml. The bytes are encoded explicitly
;; as UTF-8: the string carries no XML encoding declaration, so the parser
;; treats the stream as UTF-8, whereas a bare `.getBytes` would use the
;; platform default charset and could corrupt non-ASCII text.
(def xmldoc (xml/parse (java.io.ByteArrayInputStream. (.getBytes xmlstr "UTF-8"))))

(defn tag-matches
  "True when the :tag of `item` equals `tag`."
  [item tag]
  (-> item :tag (= tag)))

(defn content-matches
  "Concatenates the :content of `item` into one string and looks it up in the
  set built from `to-match`. Returns the matched string, or nil when the
  concatenated content is not one of the `to-match` values."
  [item to-match]
  (let [wanted (set to-match)
        text   (apply str (:content item))]
    (get wanted text)))

(defn match-criteria
  "Returns a truthy value when `item` has a direct :Type child whose content
  is one of the `to-match` values; nil otherwise."
  [item to-match]
  (letfn [(matching-type? [child]
            (and (tag-matches child :Type)
                 (content-matches child to-match)))]
    (some matching-type? (:content item))))

(defn mk-xml-walker
  "Builds a one-argument function for use with a tree walk. When the visited
  form is a vector containing at least one :Item element, the function returns
  a lazy seq of its members without those that satisfy `match-criteria` for
  the `to-remove` values; any other form is returned unchanged."
  [& to-remove]
  (fn [form]
    (let [item-vector? (and (vector? form)
                            (some (fn [member] (tag-matches member :Item)) form))]
      (if item-vector?
        (remove (fn [member] (match-criteria member to-remove)) form)
        form))))

;; Post-order walk over the parsed document with a walker built for types
;; "B" and "C"; the resulting tree is passed to xml/emit.
(xml/emit (walk/postwalk (mk-xml-walker "B" "C") xmldoc))

对于神奇的单行,您可能还需要查看Specter,它提供了一种非常简洁的语法来处理嵌套数据结构,如XML。