我正在做一个xml->使用XSLT 2.0进行tsv转换。
我的xml文件如下所示:
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE TEI.2 SYSTEM "lemmatizzazione.dtd">
<?xml-stylesheet type="text/xsl" href="dh.xsl"?>
<root>
<l>
<LM lemma="me" catg="SS">I</LM>
<LM lemma="would" catg="VV">would</LM>
<LM lemma="like" catg="VV">like</LM>
<LM lemma="to" catg="VV">to</LM>
<LM lemma="to have" catg="VV">have,</LM>
</l>
<l>
<LM lemma="this" catg="AD">this</LM>
<LM lemma="BAD" catg="E">in to</LM>
<LM lemma="a" catg="E">a</LM>
<LM lemma="ts" catg="AD">tsv</LM>
<LM lemma="for" catg="NN">format</LM>,
</l>
<l>
<LM1>
<LM lemma="but1" catg="x01">but</LM>
<LM lemma="but2" catg="x02">but</LM>
</LM1>
<LM lemma="" catg="">first</LM>
<LM lemma="" catg="">there are</LM>
<LM1>
<LM lemma="rxs" catg="fff">a</LM>
<LM lemma="tre" catg="ds">a</LM>
<LM lemma="asq" catg="rt">a</LM>
</LM1>
"<LM lemma="few" catg="E">few</LM>
</l>
<l>
<LM>problems</LM>
<LM>to</LM>
<LM>solve.</LM>
<LM>Here</LM>
<LM>and</LM>
</l>
<l>
<LM lemma="there" catg="E">there</LM>.
<LM lemma="BAD" catg="EE">This is</LM>
<LM lemma="BAD2" catg="EE">multi word</LM>
<LM lemma="good" catg="NN">expression.</LM>
</l>
</root>
所需的输出如下:
token lemma catg l's
I me SS 1
would would VV 1
like like VV 1
to to VV 1
have to have VV 1
, 1
...
format for NN 2
, 2
but but1but2 x01x02 3
我的实际XSLT是这样的:
<?xml version="1.0"?>
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:strip-space elements="*"/>
<xsl:output method="text" encoding="UTF-8"/>
<xsl:template match="LM">
<xsl:choose>
<xsl:when test="contains(.,' ')">
<xsl:call-template name="tokenize">
<xsl:with-param name="mwe" select="current()"/>
</xsl:call-template>
</xsl:when>
<xsl:otherwise>
<xsl:choose>
<xsl:when test="matches(substring(.,string-length(.)),'\.|,|;')">
<xsl:call-template name="remove_punctuation">
<xsl:with-param name="token" select="."/>
<xsl:with-param name="mark" select="substring(.,string-length(.))"/>
</xsl:call-template>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="." />
<xsl:call-template name="ids"/>
</xsl:otherwise>
</xsl:choose>
</xsl:otherwise>
</xsl:choose>
</xsl:template>
<xsl:template match="LM1">
<xsl:value-of select="LM" />
<xsl:call-template name="multi_ids" />
</xsl:template>
<xsl:template name="ids">
<xsl:text>	</xsl:text>
<xsl:value-of select="./@lemma"/><xsl:text>	</xsl:text>
<xsl:value-of select="./@catg"/><xsl:text>	</xsl:text>
<xsl:value-of select="count(../preceding-sibling::l) + 1"/><xsl:text> </xsl:text>
</xsl:template>
<xsl:template name="multi_ids">
<xsl:text>	</xsl:text>
<xsl:for-each select="LM">
<xsl:value-of select="./@lemma"/>
</xsl:for-each><xsl:text>	</xsl:text>
<xsl:for-each select="LM">
<xsl:value-of select="./@catg"/>
</xsl:for-each><xsl:text>	</xsl:text>
<xsl:value-of select="count(../preceding-sibling::l) + 1"/><xsl:text> </xsl:text>
</xsl:template>
<xsl:template name="tokenize">
<xsl:param name="mwe"/>
<xsl:for-each select="tokenize($mwe/.,'\s+')">
<xsl:value-of select="normalize-space(.)"/><xsl:text>	</xsl:text>
<xsl:value-of select="$mwe/./@lemma"/><xsl:text>	</xsl:text>
<xsl:value-of select="$mwe/./@catg"/><xsl:text>	</xsl:text>
<xsl:value-of select="count($mwe/../preceding-sibling::l) + 1"/><xsl:text> </xsl:text>
</xsl:for-each>
</xsl:template>
<xsl:template name="remove_punctuation">
<xsl:param name="token"/>
<xsl:param name="mark"/>
<xsl:for-each select="tokenize($token/.,$mark)">
<xsl:value-of select="normalize-space(.)"/><xsl:text> </xsl:text>
</xsl:for-each>
</xsl:template>
<xsl:template match="/root">
<th>token</th><xsl:text>	</xsl:text>
<th>lemma</th><xsl:text>	</xsl:text>
<th>catg</th><xsl:text>	</xsl:text>
<th>l's</th><xsl:text> </xsl:text>
<xsl:for-each select="l">
<xsl:apply-templates>
</xsl:apply-templates>
</xsl:for-each>
</xsl:template>
</xsl:stylesheet>
所以,我不知道如何做两件事:
1)如何将单词与标点符号分开
<LM lemma="to have" catg="VV">have,</LM>
have to have VV 1
, 1
2)如何“标记化”节点外的文本
<LM lemma="for" catg="NN">format</LM>,
format for NN 2
, 2