如何从自定义XML文件中提取图像URL?

时间:2016-11-03 06:23:02

标签: xml

我有一堆XML文件,其中包含以下标记中帖子和图像的链接:

<url>
    <loc>http://sample.com/sample-post</loc>
    <lastmod>2015-12-27T16:42:07-01:00</lastmod>
    <image:image>
        <image:loc>http://sample.com/1234/5678.jpg</image:loc>
        <image:title><![CDATA[tag1,tag2]]></image:title>
    </image:image>
    <image:image>
        <image:loc>http://sample.com/1234/5678.jpg</image:loc>
        <image:caption><![CDATA[tag1,tag2]]></image:caption>
    </image:image>
</url>

我想提取图像的链接并删除重复项,但这些标记中没有href属性,因此我无法像通常那样用jQuery挂钩并提取它们。有没有办法针对这种标记做到这一点?任何帮助都将不胜感激。

P.S:这个问题在互联网上有多个答案,但正如我所提到的,我的XML缺少href属性。

1 个答案:

答案 0 :(得分:0)

您可以使用Java中的SAX解析器(参见我给出的链接)。

这是一个可以帮助您入门的基本骨架。它会从您的XML中提取两个URL。骨架还会打印出它收到的内容,以便您了解各个回调函数会接收到什么。

package xmlparse;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Command-line entry point: parses an XML file with a SAX parser and lets
 * {@link MySAXHandler} report what it finds.
 */
public class xmlparse {

    /** File parsed when no path is given on the command line. */
    private static final String DEFAULT_INPUT = "images.xml";

    /**
     * Parses the XML file named by the first argument, or {@code images.xml}
     * in the working directory when no argument is supplied.
     *
     * @param args optional; {@code args[0]} is the path of the XML file to parse
     * @throws ParserConfigurationException if a SAX parser cannot be created
     * @throws SAXException if the document cannot be parsed
     * @throws IOException if the file cannot be read
     */
    public static void main(String[] args) throws ParserConfigurationException,
                                                  SAXException, IOException {

        // Fall back to the original hard-coded name so existing usage keeps working.
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT;
        File inputFile = new File(inputPath);

        SAXParserFactory factory = SAXParserFactory.newInstance();
        SAXParser saxParser = factory.newSAXParser();

        saxParser.parse(inputFile, new MySAXHandler());
    }
}

/**
 * SAX handler that collects the text content of every {@code image:loc}
 * element and prints a trace of each callback it receives.
 *
 * <p>The collected URLs are kept in document order in {@link #imageLocList}
 * and are printed when the document ends. Duplicates are kept as-is.
 */
class MySAXHandler extends DefaultHandler {
    /** Qualified name of the element whose text is collected. */
    private static final String IMAGE_LOC = "image:loc";

    /** Qualified name of the element currently open, or "" when none is tracked. */
    String currentQName = "";

    /** Text of each {@code image:loc} element seen so far, in document order. */
    List<String> imageLocList = new ArrayList<>();

    /** Accumulates the text of the {@code image:loc} element being read. */
    private final StringBuilder imageLocText = new StringBuilder();

    /**
     * Traces the element start and remembers its qualified name.
     *
     * @param uri namespace URI reported by the parser
     * @param localName local name reported by the parser
     * @param qName qualified name, e.g. {@code image:loc}
     * @param attributes attributes attached to the element
     * @throws SAXException declared by the overridden method; not thrown here
     */
    @Override
    public void startElement(String uri, String localName,
           String qName, Attributes attributes) throws SAXException {

        System.out.println("startElement got: uri: " + uri);
        System.out.println("startElement got localName: " + localName);
        System.out.println("startElement got Name: " + qName);
        System.out.println("startElement got attributes: " + attributes);
        System.out.println();

        this.currentQName = qName;
        if (IMAGE_LOC.equals(qName)) {
            // Start a fresh buffer for this element's text.
            imageLocText.setLength(0);
        }
    }

    /**
     * Traces a chunk of character data and buffers it when it belongs to an
     * {@code image:loc} element.
     *
     * @param ch buffer holding the characters
     * @param start index of the first character of this chunk in {@code ch}
     * @param length number of characters in this chunk
     */
    @Override
    public void characters(char[] ch, int start, int length) {
        // Copy only the reported range instead of the parser's whole buffer.
        String chunk = new String(ch, start, length);
        System.out.format("  Received characters (s=%d, length=%s): %s%n",
                start, length, chunk.trim());

        // A parser may split one element's text across several calls, so the
        // chunk is buffered here and only recorded once the element ends.
        if (IMAGE_LOC.equals(currentQName)) {
            imageLocText.append(chunk);
        }
    }

    /**
     * Traces the element end and, for {@code image:loc}, records the buffered
     * text (trimmed) unless it is empty.
     *
     * @param uri namespace URI reported by the parser
     * @param localName local name reported by the parser
     * @param qName qualified name of the element being closed
     */
    @Override
    public void endElement(String uri, String localName, String qName) {
        System.out.println("endElement got: uri: " + uri);
        System.out.println("endElement got localName: " + localName);
        System.out.println("endElement got qName: " + qName);
        System.out.println();

        if (IMAGE_LOC.equals(qName)) {
            String url = imageLocText.toString().trim();
            if (!url.isEmpty()) {
                imageLocList.add(url);
            }
            imageLocText.setLength(0);
        }
        this.currentQName = "";
    }

    /** Prints every collected URL, one per line. */
    @Override
    public void endDocument() {
        System.out.println("Document ended. Listing URLs:");
        imageLocList.forEach(System.out::println);
    }
}