我是Haskell的初学者。我从Project Gutenberg获得了RDF XML,如下所示:
<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF xml:base="http://www.gutenberg.org/"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:dcterms="http://purl.org/dc/terms/"
xmlns:cc="http://web.resource.org/cc/"
xmlns:dcam="http://purl.org/dc/dcam/"
xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
xmlns:pgterms="http://www.gutenberg.org/2009/pgterms/"
>
<cc:Work rdf:about="">
<rdfs:comment>Archives containing the RDF files for *all* our books can be downloaded at
http://www.gutenberg.org/wiki/Gutenberg:Feeds#The_Complete_Project_Gutenberg_Catalog</rdfs:comment>
<cc:license rdf:resource="https://creativecommons.org/publicdomain/zero/1.0/"/>
</cc:Work>
<pgterms:ebook rdf:about="ebooks/20">
<pgterms:bookshelf>
<rdf:Description rdf:nodeID="N3f8445072d8e4499b2646626f94866e0">
<rdf:value>Poetry</rdf:value>
<dcam:memberOf rdf:resource="2009/pgterms/Bookshelf"/>
</rdf:Description>
</pgterms:bookshelf>
<dcterms:hasFormat>
<pgterms:file rdf:about="http://www.gutenberg.org/ebooks/20.rdf">
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2017-03-16T05:01:13.615047</dcterms:modified>
<dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">12133</dcterms:extent>
<dcterms:isFormatOf rdf:resource="ebooks/20"/>
<dcterms:format>
<rdf:Description rdf:nodeID="N735ba077c8424051b6470a92682aaa5e">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/rdf+xml</rdf:value>
</rdf:Description>
</dcterms:format>
</pgterms:file>
</dcterms:hasFormat>
<dcterms:issued rdf:datatype="http://www.w3.org/2001/XMLSchema#date">1991-10-01</dcterms:issued>
<dcterms:title>Paradise Lost</dcterms:title>
<dcterms:subject>
<rdf:Description rdf:nodeID="Ne259525c666c4886a996acbdddca0682">
<rdf:value>PR</rdf:value>
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/LCC"/>
</rdf:Description>
</dcterms:subject>
<dcterms:hasFormat>
<pgterms:file rdf:about="http://www.gutenberg.org/files/20/20.txt">
<dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">507133</dcterms:extent>
<dcterms:isFormatOf rdf:resource="ebooks/20"/>
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2011-03-02T06:33:54</dcterms:modified>
<dcterms:format>
<rdf:Description rdf:nodeID="Nbd1740a2927845058b0fe43326dcc48b">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">text/plain; charset=us-ascii</rdf:value>
</rdf:Description>
</dcterms:format>
</pgterms:file>
</dcterms:hasFormat>
<dcterms:hasFormat>
<pgterms:file rdf:about="http://www.gutenberg.org/ebooks/20.epub.images">
<dcterms:isFormatOf rdf:resource="ebooks/20"/>
<dcterms:format>
<rdf:Description rdf:nodeID="Nb08f3d2980e64e91a402eb5b205c10bc">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/epub+zip</rdf:value>
</rdf:Description>
</dcterms:format>
<dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">232622</dcterms:extent>
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2017-03-01T01:04:17.425321</dcterms:modified>
</pgterms:file>
</dcterms:hasFormat>
<dcterms:hasFormat>
<pgterms:file rdf:about="http://www.gutenberg.org/ebooks/20.kindle.images">
<dcterms:isFormatOf rdf:resource="ebooks/20"/>
<dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">933970</dcterms:extent>
<dcterms:format>
<rdf:Description rdf:nodeID="Nff1df57b9552466d96b114f20424b5a2">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/x-mobipocket-ebook</rdf:value>
</rdf:Description>
</dcterms:format>
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2017-03-01T01:04:21.321235</dcterms:modified>
</pgterms:file>
</dcterms:hasFormat>
<dcterms:language>
<rdf:Description rdf:nodeID="N91273d0bffc74be393cda307d2b05137">
<rdf:value rdf:datatype="http://purl.org/dc/terms/RFC4646">en</rdf:value>
</rdf:Description>
</dcterms:language>
<dcterms:subject>
<rdf:Description rdf:nodeID="N5e35fb378b37483ca6ef7a08f27cf936">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/LCSH"/>
<rdf:value>Eve (Biblical figure) -- Poetry</rdf:value>
</rdf:Description>
</dcterms:subject>
<dcterms:license rdf:resource="license"/>
<dcterms:hasFormat>
<pgterms:file rdf:about="http://www.gutenberg.org/ebooks/20.html.images">
<dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">614618</dcterms:extent>
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2017-03-01T01:04:16.685338</dcterms:modified>
<dcterms:isFormatOf rdf:resource="ebooks/20"/>
<dcterms:format>
<rdf:Description rdf:nodeID="N7567260ec2fd48c0be3d2858e08ac35d">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">text/html</rdf:value>
</rdf:Description>
</dcterms:format>
</pgterms:file>
</dcterms:hasFormat>
<dcterms:hasFormat>
<pgterms:file rdf:about="http://www.gutenberg.org/ebooks/20.epub.noimages">
<dcterms:isFormatOf rdf:resource="ebooks/20"/>
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2017-03-01T01:04:17.695324</dcterms:modified>
<dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">232623</dcterms:extent>
<dcterms:format>
<rdf:Description rdf:nodeID="Nb640302bc2a84a31b0e154318df817d1">
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/epub+zip</rdf:value>
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
</rdf:Description>
</dcterms:format>
</pgterms:file>
</dcterms:hasFormat>
<dcterms:hasFormat>
<pgterms:file rdf:about="http://www.gutenberg.org/ebooks/20.kindle.noimages">
<dcterms:isFormatOf rdf:resource="ebooks/20"/>
<dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">933967</dcterms:extent>
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2017-03-01T01:04:24.846165</dcterms:modified>
<dcterms:format>
<rdf:Description rdf:nodeID="N1857bba1f5484e3d84846e1a554ec593">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/x-mobipocket-ebook</rdf:value>
</rdf:Description>
</dcterms:format>
</pgterms:file>
</dcterms:hasFormat>
<dcterms:publisher>Project Gutenberg</dcterms:publisher>
<dcterms:rights>Public domain in the USA.</dcterms:rights>
<dcterms:creator>
<pgterms:agent rdf:about="2009/agents/17">
<pgterms:deathdate rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">1674</pgterms:deathdate>
<pgterms:webpage rdf:resource="http://en.wikipedia.org/wiki/John_Milton"/>
<pgterms:name>Milton, John</pgterms:name>
<pgterms:birthdate rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">1608</pgterms:birthdate>
</pgterms:agent>
</dcterms:creator>
<dcterms:type>
<rdf:Description rdf:nodeID="N0f6e6d76b1ff4ea9a2c5c37949efe82b">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/DCMIType"/>
<rdf:value>Text</rdf:value>
</rdf:Description>
</dcterms:type>
<dcterms:subject>
<rdf:Description rdf:nodeID="N202624c4b5994d39a3ab8bf0a2a31d95">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/LCSH"/>
<rdf:value>Adam (Biblical figure) -- Poetry</rdf:value>
</rdf:Description>
</dcterms:subject>
<dcterms:hasFormat>
<pgterms:file rdf:about="http://www.gutenberg.org/ebooks/20.html.noimages">
<dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">614618</dcterms:extent>
<dcterms:format>
<rdf:Description rdf:nodeID="N79f919d14da448e19eb05c444322ddd2">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">text/html</rdf:value>
</rdf:Description>
</dcterms:format>
<dcterms:isFormatOf rdf:resource="ebooks/20"/>
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2017-03-01T01:04:16.955332</dcterms:modified>
</pgterms:file>
</dcterms:hasFormat>
<pgterms:bookshelf>
<rdf:Description rdf:nodeID="Nec598f664c934ed49ba3c0168ef09615">
<rdf:value>Banned Books from Anne Haight's list</rdf:value>
<dcam:memberOf rdf:resource="2009/pgterms/Bookshelf"/>
</rdf:Description>
</pgterms:bookshelf>
<dcterms:hasFormat>
<pgterms:file rdf:about="http://www.gutenberg.org/ebooks/20.txt.utf-8">
<dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">507105</dcterms:extent>
<dcterms:format>
<rdf:Description rdf:nodeID="N069b84f8b10844e9a6c713f4c163880b">
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">text/plain</rdf:value>
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
</rdf:Description>
</dcterms:format>
<dcterms:isFormatOf rdf:resource="ebooks/20"/>
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2017-03-01T01:04:15.953358</dcterms:modified>
</pgterms:file>
</dcterms:hasFormat>
<dcterms:subject>
<rdf:Description rdf:nodeID="Nb489692851fa496d96b1a7fdf7a71b21">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/LCSH"/>
<rdf:value>Fall of man -- Poetry</rdf:value>
</rdf:Description>
</dcterms:subject>
<pgterms:downloads rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">2088</pgterms:downloads>
<dcterms:subject>
<rdf:Description rdf:nodeID="Naa6849a7660b4039baadec8af58f0c58">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/LCSH"/>
<rdf:value>Bible. Genesis -- History of Biblical events -- Poetry</rdf:value>
</rdf:Description>
</dcterms:subject>
<dcterms:hasFormat>
<pgterms:file rdf:about="http://www.gutenberg.org/files/20/20.zip">
<dcterms:extent rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">205748</dcterms:extent>
<dcterms:format>
<rdf:Description rdf:nodeID="N19cf968278bc4922bd87b17209c20d94">
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">text/plain; charset=us-ascii</rdf:value>
</rdf:Description>
</dcterms:format>
<dcterms:isFormatOf rdf:resource="ebooks/20"/>
<dcterms:modified rdf:datatype="http://www.w3.org/2001/XMLSchema#dateTime">2011-03-02T06:34:42</dcterms:modified>
<dcterms:format>
<rdf:Description rdf:nodeID="N94c2881f340a49c18246b69af3abcf12">
<rdf:value rdf:datatype="http://purl.org/dc/terms/IMT">application/zip</rdf:value>
<dcam:memberOf rdf:resource="http://purl.org/dc/terms/IMT"/>
</rdf:Description>
</dcterms:format>
</pgterms:file>
</dcterms:hasFormat>
</pgterms:ebook>
<rdf:Description rdf:about="http://en.wikipedia.org/wiki/John_Milton">
<dcterms:description>Wikipedia</dcterms:description>
</rdf:Description>
</rdf:RDF>
我希望将此信息转换为可以查询和操作的常规Haskell数据结构。例如,我可能想要查询此作品的标题,或获取其所有维基百科URL。
我注意到 Haskell 有一个 RDF 库 rdf4h,并且它带有一个 XML 解析器。但我完全看不懂它的文档,而且似乎哪里都找不到教程。
我想要做的另一件事就是将所有这些RDF / XML文件导入某种数据库,然后使用Haskell以某种方式查询该数据库。但我不确定哪个数据库是合适的,或者是否可能。
当然,我可以把它当作普通的 XML 数据来处理,忽略 RDF 这一层,但这看起来工作量很大,而且我必须为想从这个 XML 文件中取出的每一项内容编写很长的数据结构。
有没有人对如何使用Haskell查询这样的数据有任何想法?
答案 0 :(得分:4)
我注意到 Haskell 有一个 RDF 库 rdf4h,并且它带有一个 XML 解析器。但我完全看不懂它的文档,而且似乎哪里都找不到教程。
下面是我试着弄懂这份文档的过程。(请对以下内容持保留态度,因为我对 RDF 一无所知,也没有实际尝试使用过这个库。)
如果我们打开顶级模块 Data.RDF 的文档,会发现三个看起来相关的函数:parseString、parseFile 和 parseURL。例如,parseString 的文档是:
parseString :: Rdf a => p -> Text -> Either ParseFailure (RDF a)
从给定文本中解析 RDF,结果要么是一条解析失败信息,要么是解析得到的 RDF。
要调用它,我们需要提供p
和Text
(要解析的字符串)。但是什么是p
?如果我们向上滚动一点,我们会注意到parseString
是RdfParser
类的一种方法。实例列表 - 有助于理解类型类 - 表明XmlParser
是RdfParser
的实例。这看起来很有用!
如果我们现在点击链接进入 XmlParser 的文档条目,会看到它有一个导出的(或者说“公共的”,如果你愿意这么叫的话)构造函数:
XmlParser (Maybe BaseUrl) (Maybe Text)
我们可以继续沿着链接了解到 BaseUrl 只是对 Text 的 newtype 包装。但是,似乎没有任何有用的文档说明构造函数的参数应该是什么。除了查看该模块的源代码(同样可以通过链接访问)之外,没有太多别的办法可试。令人惊讶的是,源代码中的函数附有有用的文档。这是 RdfParser 的实例:
-- |'XmlParser' is an instance of 'RdfParser'.
instance RdfParser XmlParser where
parseString (XmlParser bUrl dUrl) = parseXmlRDF bUrl dUrl
parseFile (XmlParser bUrl dUrl) = parseFile' bUrl dUrl
parseURL (XmlParser bUrl dUrl) = parseURL' bUrl dUrl
这里的 Haddock 注释是多余的;不过,parseURL' 的 Haddock 注释……
...
-- |Parse the document at the given location URL as an XML document, using an optional @BaseUrl@
-- as the base URI, and using the given document URL as the URI of the XML document itself.
--
-- The @BaseUrl@ is used as the base URI within the document for resolving any relative URI references.
-- It may be changed within the document using the @\@base@ directive. At any given point, the current
-- base URI is the most recent @\@base@ directive, or if none, the @BaseUrl@ given to @parseURL@, or
-- if none given, the document URL given to @parseURL@. For example, if the @BaseUrl@ were
-- @http:\/\/example.org\/@ and a relative URI of @\<b>@ were encountered (with no preceding @\@base@
-- directive), then the relative URI would expand to @http:\/\/example.org\/b@.
--
-- The document URL is for the purpose of resolving references to 'this document' within the document,
-- and may be different than the actual location URL from which the document is retrieved. Any reference
-- to @\<>@ within the document is expanded to the value given here. Additionally, if no @BaseUrl@ is
-- given and no @\@base@ directive has appeared before a relative URI occurs, this value is used as the
-- base URI against which the relative URI is resolved.
--
-- Returns either a @ParseFailure@ or a new RDF containing the parsed triples.
parseURL' :: (Rdf a) =>
Maybe BaseUrl -- ^ The optional base URI of the document.
-> Maybe T.Text -- ^ The document URI (i.e., the URI of the document itself); if Nothing, use location URI.
-> String -- ^ The location URI from which to retrieve the XML document.
-> IO (Either ParseFailure (RDF a))
-- ^ The parse result, which is either a @ParseFailure@ or the RDF
-- corresponding to the XML document.
parseURL' bUrl docUrl = _parseURL (parseXmlRDF bUrl docUrl)
...和parseXmlRDF
:
-- |Parse a xml T.Text to an RDF representation
parseXmlRDF :: (Rdf a)
=> Maybe BaseUrl -- ^ The base URL for the RDF if required
-> Maybe T.Text -- ^ DocUrl: The request URL for the RDF if available
-> T.Text -- ^ The contents to parse
-> Either ParseFailure (RDF a) -- ^ The RDF representation of the triples or ParseFailure
parseXmlRDF bUrl dUrl xmlStr = case runParseArrow of
(_,r:_) -> Right r
_ -> Left (ParseFailure "XML parsing failed")
where runParseArrow = runSLA (xreadDoc >>> isElem >>> addMetaData bUrl dUrl >>> getRDF) initState (T.unpack xmlStr)
initState = GParseState { stateGenId = 0 }
这些Haddock注释未显示在实际文档中,因为它们所属的函数未导出。
总而言之,我认为这个库的文档还有改进的空间。不过在这种情况下,熟悉 Hackage 文档的浏览方式可以在一定程度上弥补这一不足。
所以我尝试
parsed <- parseURL (XmlParser Nothing Nothing) testText
,但它说Ambiguous type variable ‘a0’ arising from a use of ‘parseURL’ prevents the constraint ‘(Rdf a0)’ from being solved. Probable fix: use a type annotation to specify what ‘a0’ should be
。
这个错误告诉你,必须指定下面这个签名中的 a 是什么……
parseURL :: Rdf a => p -> String -> IO (Either ParseFailure (RDF a))
...要么在需要具体类型的地方使用它,要么添加类型注释。
继续沿着文档中的链接往下看,可以发现 Rdf 是一个有两个实例(TList 和 AdjHashMap)的类型类,而 RDF 是一个数据族(data family)。既然如此,你需要的写法类似于:
parsed <- parseURL (XmlParser Nothing Nothing) testText :: IO (Either ParseFailure (RDF TList))
(注意类型注释如何与parseURL
签名指定的结果类型匹配。)
或者,启用 ScopedTypeVariables 扩展后,可以这样写:
parsed :: Either ParseFailure (RDF TList) <- parseURL (XmlParser Nothing Nothing) testText