如果Sitemap具有超过最大数量的网址,则将Sitemap拆分为更多站点地图

时间:2015-02-06 11:26:55

标签: java xml sitemap

我想将Sitemap拆分为Sitemap,如果它超过maxURLs。如果Sitemap具有多个网址,则以下示例应拆分该网站地图。

 import javax.xml.parsers.DocumentBuilder;
    import javax.xml.parsers.DocumentBuilderFactory;
    import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.CharacterData;
import org.w3c.dom.*;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
    import java.io.IOException;
    import java.io.StringReader;
    import java.util.HashSet;
    import java.util.Iterator;
    import java.util.Set;

    public class SiteMapSplitter {

    public static void main(String[] args){

            String sitemapStr = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" +
                    "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n" +
                    "<url>\n" +
                    "<loc>test1.html</loc>\n" +
                    "<lastmod>today</lastmod>\n" +
                    "<changefreq>daily</changefreq>\n" +
                    "<priority>1.0</priority>\n" +
                    "</url>\n" +
                    "<url>\n" +
                    "<loc>test2.html</loc>\n" +
                    "<lastmod>yesterday</lastmod>\n" +
                    "<changefreq>daily</changefreq>\n" +
                    "<priority>1.0</priority>\n" +
                    "</url></urlset>";
            try {
                splitSitemap(sitemapStr);
            } catch (ParserConfigurationException e) {
                e.printStackTrace();
            }
        }

        static private void splitSitemap(String sitemapStr) throws ParserConfigurationException {

            DocumentBuilder db = null;
            try {
                db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            } catch (ParserConfigurationException e) {
                e.printStackTrace();
            }
            InputSource is = new InputSource();
        is.setCharacterStream(new StringReader(sitemapStr));

            Document doc = null;
            try {
                doc = db.parse(is);
            } catch (SAXException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
            NodeList nodes = doc.getElementsByTagName("url");

            int maxURLs = 1;
            Set<String> smURLsSet= new HashSet<String>();
            if (nodes.getLength()>maxURLs){
                for (int i = 0; i < nodes.getLength(); i++) {
                    StringBuilder smURLsBuilder = new StringBuilder("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" +
                            "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n");
                    for (int k = 0; k<maxURLs; k++){
                        Element element = (Element) nodes.item(i);
                        smURLsBuilder.append(element);
                    }
                    smURLsSet.add(smURLsBuilder.toString());

        }
                Iterator i = smURLsSet.iterator();
                while(i.hasNext()){
                    System.out.println(i.next());
                }
            }

    }

    }

问题在于Element element = (Element) nodes.item(i); smURLsBuilder.append(element);

不会将整个元素(在本例中为url及其子绿色)附加到smURLsBuilder。这该怎么做?

1 个答案:

答案 0 :(得分:0)

您应该考虑对站点地图使用面向对象的方法。使用数据绑定(JAXB)或使用data projection更短(披露:我与该项目有关)。这样您就不需要通过字符串连接来创建XML。

public class SitemapSplitter {

    static String sitemapStr = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" +
            "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n" +
            "<url>\n" +
            "<loc>test1.html</loc>\n" +
            "<lastmod>today</lastmod>\n" +
            "<changefreq>daily</changefreq>\n" +
            "<priority>1.0</priority>\n" +
            "</url>\n" +
            "<url>\n" +
            "<loc>test2.html</loc>\n" +
            "<lastmod>yesterday</lastmod>\n" +
            "<changefreq>daily</changefreq>\n" +
            "<priority>1.0</priority>\n" +
            "</url></urlset>";

    public interface Sitemap {
        @XBWrite("/urlset/url")
        Sitemap setUrls(List<? extends Node> urls);
    }

    public static void main(String... args) {
        XBProjector projector = new XBProjector(Flags.TO_STRING_RENDERS_XML);
        // Get all urls from existing sitemap.
        List<Node> urlNodes = projector.onXMLString(sitemapStr).evalXPath("/xbdefaultns:urlset/xbdefaultns:url").asListOf(Node.class);
        for (Node urlNode: urlNodes) {            
            // Create a new sitemap, here with only one url
            Sitemap newSitemap = projector.onXMLString(sitemapStr).createProjection(Sitemap.class).setUrls(Collections.singletonList(urlNode));
            System.out.println(newSitemap);
        }       
    }
}

此程序打印出来

<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>test1.html</loc>
<lastmod>today</lastmod>
<changefreq>daily</changefreq>
<priority>1.0</priority>
</url>
</urlset>

<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>test2.html</loc>
<lastmod>yesterday</lastmod>
<changefreq>daily</changefreq>
<priority>1.0</priority>
</url>
</urlset>