我正在尝试从html文件中的给定字符串生成包含多个作者的XML。该字符串通过XPATH的帮助获取并存储在字符串List中。 XML是通过类构造函数生成的。
原始HTML:
<meta name="DC.Creator" content="Gareth Jones, Alexander M. Robertson, Chawchat Santimetvirul, Peter Willett">
我需要帮助将每位作者拆分为名字和姓氏两个字符串,然后为每位作者生成一个预定义的XML片段,并让列表中最后一位作者的XML片段以额外的“),”结尾。
XML的片段
new XElement("author",
new XAttribute("primary_contact", "false"),
new XAttribute("include_in_browser", "true"),
new XAttribute("user_group_ref", "Authors"),
new XElement("firstname", AuthorData.FirstName),
new XElement("lastname", AuthorData.LastName),
new XElement("email", AuthorData.Email))), <-- Note the ending ), for the last author in list
如何为每位作者(author)分别生成一个XML片段?
这是完整程序的Pastebin https://pastebin.com/sx0H7MFd
这是完整的代码(对不起,很长的帖子)
using System;
using System.Collections.Generic;
using System.Linq;
using System.Xml;
using System.Xml.Linq;
using System.Xml.Serialization;
using System.IO;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Xml.Schema;
using HtmlAgilityPack;
namespace XML_Template_Generator
{
class Program
{
/// <summary>
/// Program-wide settings: the folders that are scanned and the search
/// patterns used to pick files out of them.
/// </summary>
public class Global
{
    // Folders that are scanned.
    public static string ArchiveDirectory = @"D:\Journals\";
    public static string Directory = @"D:\Journals\1-1";

    // Search patterns handed to the directory/file listing calls.
    public static string AllFilesExtension = "*.*";
    public static string HtmlExtension = "*.html";

    // Search patterns for the image types that are counted.
    public static string JpgExtension = "*.jpg";
    public static string GifExtension = "*.gif";
    public static string PngExtension = "*.png";
}
/// <summary>
/// Name and e-mail address of an author.
/// </summary>
/// <remarks>
/// The three properties are static, so the class holds exactly one set of
/// values that every instance shares: constructing a new instance overwrites
/// whatever was stored before.
/// </remarks>
public class AuthorData
{
    public static string FirstName { get; set; }
    public static string LastName { get; set; }
    public static string Email { get; set; }

    /// <summary>
    /// Stores the given values in the shared static properties.
    /// </summary>
    /// <param name="firstName">Value assigned to <see cref="FirstName"/>.</param>
    /// <param name="lastName">Value assigned to <see cref="LastName"/>.</param>
    /// <param name="email">Value assigned to <see cref="Email"/>.</param>
    public AuthorData(string firstName, string lastName, string email)
    {
        AuthorData.Email = email;
        AuthorData.LastName = lastName;
        AuthorData.FirstName = firstName;
    }
}
/// <summary>
/// Builds the issue/article import XML from the static metadata fields of this class.
/// </summary>
public class XmlGenerator
{
    // Evaluated once, when this type is initialised: the files in Global.Directory
    // that match Global.HtmlExtension.
    public static string[] htmlFilelist = Directory.GetFiles(Global.Directory, Global.HtmlExtension);

    // Issue and article specific strings
    public static string Title = "";
    public static string Description = "";

    // These three fields copy the AuthorData values once, when this type is initialised;
    // later changes to AuthorData are not reflected in them. XmlConstructor does not use
    // them - it reads AuthorData directly.
    public static string AuthorsFirstname = AuthorData.FirstName;
    public static string AuthorsLastname = AuthorData.LastName;
    public static string AuthorEmail = AuthorData.Email;

    public static string Publisher = "";
    public static string PrimaryAuthorFirstname = "";
    public static string PrimaryAuthorLastname = "";
    public static string PrimaryAuthorEmail = "info@email.se";

    /// <summary>
    /// Assembles the import document from the current static field values and the
    /// current <see cref="AuthorData"/> values.
    /// </summary>
    /// <returns>The document serialised with <c>XDocument.ToString()</c>.</returns>
    public static string XmlConstructor()
    {
        XNamespace xsi = "http://www.w3.org/2001/XMLSchema-instance";
        XDocument xmldocument = new XDocument(
            new XDeclaration("1.0", "utf-8", "yes"),
            new XComment("Creating the issues and articles tree for import"),
            new XElement("issues",
                new XElement("issue",
                    new XAttribute(XNamespace.Xmlns + "xsi", "http://www.w3.org/2001/XMLSchema-instance"),
                    new XAttribute("published", true),
                    new XAttribute("current", false),
                    new XAttribute("access_status", "1"),
                    new XAttribute(xsi + "schemaLocation", "http://dev.openjournal.tld native.xsd"),
                    new XElement("id",
                        new XAttribute("type", "internal"),
                        new XAttribute("advice", "ignore"), "4"),
                    new XElement("description",
                        new XAttribute("locale", "en_US"), Description),
                    new XElement("issue_identification",
                        new XElement("volume", 1),
                        new XElement("number", 1),
                        new XElement("year", 1995),
                        new XElement("title", Title,
                            new XAttribute("locale", "en-us"))),
                    new XElement("date_published", "2018-05-16"),
                    new XElement("last_modified", "2018-05-16"),
                    new XElement("sections",
                        new XElement("section",
                            new XAttribute("ref", "ART"),
                            new XAttribute("seq", 0),
                            new XAttribute("editor_restricted", 0),
                            new XAttribute("meta_indexed", 1),
                            new XAttribute("abstracts_not_required", 0),
                            new XAttribute("hide_title", 0),
                            new XAttribute("hide_author", 0),
                            new XAttribute("abstract_word_count", 0)),
                        new XElement("id",
                            new XAttribute("type", "internal"),
                            new XAttribute("advice", "ignore")),
                        new XElement("abbrev",
                            new XAttribute("locale", "en_US"), "ART"),
                        new XElement("title",
                            new XAttribute("locale", "en_US"), "Artiklar")),
                    new XElement("issue_covers",
                        new XElement("cover",
                            new XAttribute("locale", "en_US")),
                        new XElement("cover_image", "cover_issue_4_en_US.jpg"),
                        new XElement("cover_image_alt_text")),
                    new XElement("issue_galleys",
                        new XAttribute(XNamespace.Xmlns + "xsi", "http://www.w3.org/2001/XMLSchema-instance"),
                        new XAttribute(xsi + "schemaLocation", "http://dev.openjournal.tld native.xsd"),
                        new XElement("issue_galley",
                            new XAttribute("locale", "en_US"),
                            new XElement("label", "Paper1"),
                            // One issue_file element per HTML file; file_size is written
                            // once per file (it used to be emitted twice).
                            from f in htmlFilelist
                            select new XElement("issue_file",
                                new XElement("file_name", f),
                                new XElement("file_type", "text/html"),
                                new XElement("file_size", "FILE SIZE"),
                                new XElement("content_type", 1),
                                new XElement("original_file_name", "FILE NAME"),
                                new XElement("date_uploaded", "2018-05-16"),
                                new XElement("date_modified", "2018-05-16")))),
                    new XElement("articles",
                        new XAttribute(XNamespace.Xmlns + "xsi", "http://www.w3.org/2001/XMLSchema-instance"),
                        new XAttribute(xsi + "schemaLocation", "http://dev.openjournal.tld native.xsd"),
                        new XElement("article",
                            new XAttribute(XNamespace.Xmlns + "xsi",
                                "http://www.w3.org/2001/XMLSchema-instance"),
                            new XAttribute("locale", "en_US"),
                            new XAttribute("date_submitted", DateTime.Now.ToString("yyyy-MM-dd")),
                            new XAttribute("stage", "production"),
                            new XAttribute("date_published", "1995-01-01"),
                            new XAttribute("section_ref", "ART"),
                            new XAttribute("seq", 1),
                            new XAttribute("access_status", 0),
                            new XElement("id",
                                new XAttribute("type", "internal"),
                                new XAttribute("advice", "ignore"), 5),
                            new XElement("title",
                                new XAttribute("locale", "en_US"), Title),
                            new XElement("abstract",
                                new XAttribute("locale", "en_US"), Description),
                            new XElement("licenseUrl", "http://creativecommons.org/licenses/by-nc-nd/4.0"),
                            new XElement("copyrightHolder",
                                new XAttribute("locale", "en_US"), "INSERT NAME OF COPYRIGHT HOLDER HERE"),
                            new XElement("copyrightYear", "INSERT YEAR HERE"),
                            new XElement("keywords",
                                new XAttribute("locale", "en_US"),
                                new XElement("keyword", "HOW DO I GET MULTIPLE KEYWORDS?")),
                            new XElement("authors",
                                new XAttribute(XNamespace.Xmlns + "xsi",
                                    "http://www.w3.org/2001/XMLSchema-instance"),
                                new XAttribute(xsi + "schemaLocation", "http://dev.openjournal.tld native.xsd"),
                                // Primary contact, taken from the PrimaryAuthor* fields.
                                new XElement("author",
                                    new XAttribute("primary_contact", "true"),
                                    new XAttribute("include_in_browser", "true"),
                                    new XAttribute("user_group_ref", "Authors"),
                                    new XElement("firstname", PrimaryAuthorFirstname),
                                    new XElement("lastname", PrimaryAuthorLastname),
                                    new XElement("email", PrimaryAuthorEmail)),
                                // Single additional author, taken from the static AuthorData values.
                                new XElement("author",
                                    new XAttribute("primary_contact", "false"),
                                    new XAttribute("include_in_browser", "true"),
                                    new XAttribute("user_group_ref", "Authors"),
                                    new XElement("firstname", AuthorData.FirstName),
                                    new XElement("lastname", AuthorData.LastName),
                                    new XElement("email", AuthorData.Email))),
                            new XElement("submission_file",
                                new XAttribute(XNamespace.Xmlns + "xsi",
                                    "http://www.w3.org/2001/XMLSchema-instance"),
                                new XAttribute("id", "INSERT ID HERE"),
                                new XAttribute(xsi + "schemaLocation", "http://dev.openjournal.tld native.xsd"),
                                new XElement("revision",
                                    new XAttribute("number", "1"),
                                    new XAttribute("genre", "ARTIKELTEXT"),
                                    new XAttribute("filename", "INSERT FILENAME HERE"),
                                    new XAttribute("date_uploaded", "INSERT DATE FOR UPLOAD HERE"),
                                    new XAttribute("date_modified", "INSERT DATE FOR LAST MODIFICATION HERE"),
                                    new XAttribute("filesize", "INSERT FILE SIZE HERE (MIGHT BE OPTIONAL)"),
                                    new XAttribute("filetype", "INSERT TYPE OF FILE text/html OR PICTURES"),
                                    new XAttribute("user_group", "Authors"),
                                    new XAttribute("uploader", "INSERT FIRST AUTHOR AS UPLOADER HERE"),
                                    new XElement("name",
                                        new XAttribute("locale", "en_US"), "AUTHORSNAME, USER_GROUP, FILENAME"),
                                    new XElement("href",
                                        new XAttribute("src", "http://localhost/importfolder/papers.html"),
                                        new XAttribute("mime_type", "text/url"))
                                )))))));
        return xmldocument.ToString();
    }
}
/// <summary>
/// Reads the Dublin Core meta tags of every HTML file in the list, stores them in
/// <c>XmlGenerator</c>, prints what was found and finally prints the generated XML.
/// </summary>
/// <remarks>
/// The metadata fields are overwritten for each file, so the XML printed after the
/// loop reflects the last file that was read.
/// </remarks>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // list all directory paths. Use in a foreach-loop to select each directory.
    string[] dirs = Directory.GetDirectories(Global.ArchiveDirectory, Global.AllFilesExtension, SearchOption.TopDirectoryOnly);

    // List all files and types in individual lists
    string[] htmlFilelist = XmlGenerator.htmlFilelist;
    var jpgFilelist = Getfile(Global.Directory, Global.JpgExtension);
    var gifFilelist = Getfile(Global.Directory, Global.GifExtension);
    var pngFilelist = Getfile(Global.Directory, Global.PngExtension);

    // Read every file in list and find information with XPath
    foreach (string file in htmlFilelist)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.Load(file);
        HtmlNodeNavigator navigator = (HtmlNodeNavigator) doc.CreateNavigator();

        // A missing meta tag yields an empty string instead of a NullReferenceException.
        XmlGenerator.Title = ReadMetaContent(navigator, "DC.Title");
        XmlGenerator.Description = ReadMetaContent(navigator, "DC.Description");
        XmlGenerator.Publisher = ReadMetaContent(navigator, "DC.Publisher");

        // Get the author or authors in to a list. Each name is trimmed because the
        // source separates names with ", ", and empty entries are skipped.
        var authors = new List<string>();
        foreach (var a in ReadMetaContent(navigator, "DC.Creator").Split(','))
        {
            string name = a.Trim();
            if (name.Length > 0)
            {
                authors.Add(name);
            }
        }

        // The first listed author is the primary author; store the name parts so
        // the generated XML no longer carries empty primary-author names.
        string primaryFirstName = "";
        string primaryLastName = "";
        if (authors.Count > 0)
        {
            SplitAuthorName(authors[0], out primaryFirstName, out primaryLastName);
        }
        XmlGenerator.PrimaryAuthorFirstname = primaryFirstName;
        XmlGenerator.PrimaryAuthorLastname = primaryLastName;

        // Test for co-authors names
        AuthorData.FirstName = "Kalle";
        AuthorData.LastName = "Andersson";
        AuthorData.Email = "authors@email.com";

        // Test with output to console
        Console.WriteLine("*** Authors ***");
        foreach (var a in authors)
        {
            Console.WriteLine(a);
        }
        Console.WriteLine("File: {0}", file);
        Console.WriteLine("Title: {0}", XmlGenerator.Title);
        Console.WriteLine();
        Console.WriteLine("Description: {0}", XmlGenerator.Description);
        Console.WriteLine();
        Console.WriteLine("Primary Author: {0} {1}", primaryFirstName, primaryLastName);
        Console.WriteLine("Primary Author Email: {0}", XmlGenerator.PrimaryAuthorEmail);
        Console.WriteLine("Publisher: {0}", XmlGenerator.Publisher);
        Console.WriteLine("********");
    }

    // Output to console for testing
    Console.WriteLine("Total number of directories: {0}", dirs.Length);
    Console.WriteLine("Number of html-files: {0}", htmlFilelist.Length);
    Console.WriteLine("Number of jpg-files: {0}", jpgFilelist.Count());
    Console.WriteLine("Number of gif-files: {0}", gifFilelist.Count());
    Console.WriteLine("Number of png-files: {0}", pngFilelist.Count());
    Console.WriteLine("*** XML ***");
    Console.WriteLine(XmlGenerator.XmlConstructor());
    Console.ReadKey();
}

// Returns the content attribute of the meta tag with the given name, or "" when the
// document has no such tag.
private static string ReadMetaContent(HtmlNodeNavigator navigator, string metaName)
{
    var node = navigator.SelectSingleNode("//meta[@name='" + metaName + "']/@content");
    return node == null ? "" : node.Value;
}

// Splits a full name at its last blank: everything before it is the first name (so
// middle names and initials stay with it), the rest is the last name. A name without
// a blank is returned as the last name with an empty first name.
private static void SplitAuthorName(string fullName, out string firstName, out string lastName)
{
    string name = fullName.Trim();
    int lastSpace = name.LastIndexOf(' ');
    if (lastSpace < 0)
    {
        firstName = "";
        lastName = name;
        return;
    }
    firstName = name.Substring(0, lastSpace).Trim();
    lastName = name.Substring(lastSpace + 1);
}
/// <summary>
/// Lists the files under <paramref name="fdir"/>, including its subdirectories,
/// whose names match the search pattern <paramref name="ext"/>.
/// </summary>
/// <param name="fdir">Directory to search.</param>
/// <param name="ext">Search pattern, for example <c>*.jpg</c>.</param>
/// <returns>
/// The matching paths. When the listing throws, the exception message is written to
/// the console and an empty sequence is returned.
/// </returns>
private static IEnumerable<string> Getfile(string fdir, string ext)
{
    var found = new List<string>();
    try
    {
        foreach (string path in Directory.GetFiles(fdir, ext, SearchOption.AllDirectories))
        {
            found.Add(path);
        }
    }
    catch (Exception error)
    {
        // Best effort: report the problem and fall through to the (empty) result.
        Console.WriteLine(error.Message);
    }
    return found;
}
/// <summary>
/// Returns the subdirectories that <c>Directory.GetDirectories</c> reports for
/// <paramref name="dir"/>, copied into a list.
/// </summary>
/// <param name="dir">Directory whose subdirectories are listed.</param>
/// <returns>A new list holding the subdirectory paths.</returns>
static List<string> build_directory_list(string dir)
{
    return new List<string>(Directory.GetDirectories(dir));
}
}
}
答案 0 :(得分:1)
尝试以下操作:
// Comma-separated author names.
string authors = "Gareth Jones, Alexander M. Robertson, Chawchat Santimetvirul, Peter Willett";
// Split already returns an array; empty entries (e.g. from a trailing comma) are dropped.
string[] authorArray = authors.Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries);
// Container element; lower-case to match the "authors" element of the target XML
// (XML names are case-sensitive).
XElement xAuthors = new XElement("authors");
foreach (string author in authorArray)
{
    // Remove the blank that follows each comma so it does not end up in the element text.
    string name = author.Trim();
    if (name.Length == 0)
    {
        continue;
    }
    XElement xAuthor = new XElement("author",
        new XAttribute("primary_contact", "false"),
        new XAttribute("include_in_browser", "true"),
        new XAttribute("user_group_ref", "Authors"),
        name
    );
    xAuthors.Add(xAuthor);
}
答案 1 :(得分:1)
我假设您已经有了包含作者姓名的字符串。下面是使用LINQ查询生成所有作者元素的一种方法。我把它们包装在一个外层的XElement中,以便得到合法的XML;如果您愿意,也可以把它们拆开单独使用。
var content = "Gareth Jones, Alexander M. Robertson, Chawchat Santimetvirul, Peter Willett";
// Trim every name (the source separates names with ", ") and skip empty entries.
var names = content.Split(',')
    .Select(n => n.Trim())
    .Where(n => n.Length > 0)
    .ToArray();
Array.ForEach(names, Console.WriteLine);
// One <author> element per name, wrapped in a single <authors> container.
var authors = new XElement("authors",
    names.Select(x => new XElement("author",
        new XAttribute("primary_contact", "false"),
        new XAttribute("include_in_browser", "true"),
        // Attribute name as used by the target XML (was misspelled "user_group_reg").
        new XAttribute("user_group_ref", "Authors"),
        // Everything before the last blank is the first name; a name without a blank
        // gives an empty first name because LastIndexOf returns -1.
        new XElement("firstname", x.Substring(0, x.LastIndexOf(' ') + 1).Trim()),
        new XElement("lastname", x.Substring(x.LastIndexOf(' ') + 1)),
        new XElement("email", "dude@work.com")
    )));
更新
我将原始答案保留在上面(我现在回到了电脑前,可以详细答复)。考虑到您在下面评论中提出的新要求,我补充了如何得到由单个author元素组成的列表,以及如何让第一位作者成为主要联系人(primary_contact为true),而其余作者的该属性为false。请参见以下代码:
// The index passed by Select marks the first author as the primary contact. Unlike a
// counter captured by the lambda, the index starts again at 0 each time this deferred
// query is enumerated, so enumerating it twice gives the same result.
IEnumerable<XElement> authors = names
    .Select(x => x.Trim())
    .Where(x => x.Length > 0)
    .Select((x, position) =>
    {
        // -1 when the name has no blank: the whole name then becomes the last name.
        int lastSpace = x.LastIndexOf(' ');
        return new XElement("author",
            new XAttribute("primary_contact", position == 0),
            new XAttribute("include_in_browser", true),
            // Attribute name as used by the target XML (was misspelled "user_group_reg").
            new XAttribute("user_group_ref", "Authors"),
            new XElement("firstname", lastSpace < 0 ? "" : x.Substring(0, lastSpace).Trim()),
            new XElement("lastname", x.Substring(lastSpace + 1)),
            new XElement("email", "dude@work.com"));
    });
// Display to console.
Array.ForEach(authors.ToArray(), Console.WriteLine);
这应该为您提供指定格式的列表。