我有一个XQuery查询,它返回输入目录中所有不属于 Basic Latin Unicode 块的不同字符。我不想返回字符本身,而是希望查询返回这些字符所属的 Unicode 块的名称(例如返回“Latin-1 Supplement”,而不是当前返回的“ä”,即带变音符的 a)。有没有办法做到这一点?
xquery version "1.0";
(:
 : Lists every distinct character outside the Basic Latin block that occurs
 : in the text nodes of the input collection.
 :
 : Output: a <characters> element whose @distinct-count attribute holds the
 : number of distinct characters found, with one <character> child per
 : character. Because of the default element namespace declared below, the
 : constructed elements are in the "urn:isbn:1-931666-22-9" namespace.
 :)
declare namespace ead = "urn:isbn:1-931666-22-9";
declare default element namespace "urn:isbn:1-931666-22-9";
declare copy-namespaces no-preserve, inherit;
import module namespace functx = "http://www.functx.com"
    at "http://www.xqueryfunctions.com/xq/functx-1.0-doc-2007-01.xq";

(: Documents to scan.
 : NOTE(review): "[input_url]" looks like a placeholder for the real
 : collection URI - confirm before running. :)
declare variable $COLL as document-node()+ := collection("[input_url]");

(: Text nodes containing at least one character outside Basic Latin. :)
let $non-BasicLatin_all := $COLL//text()[matches(., '\P{IsBasicLatin}')]

(: Every regex match found in those text nodes, duplicates included. :)
let $non-BasicLatin_item :=
    for $x in $non-BasicLatin_all
    return functx:get-matches($x, '\P{IsBasicLatin}')

(: One <character> element per distinct match.
 : Empty strings are skipped: functx:get-matches is reported to emit '' for
 : the non-matching stretches of its input, which would otherwise appear as
 : an empty <character/> and inflate the count. The filter is a no-op when
 : no empty strings are present. :)
let $distinct_character :=
    for $x in distinct-values($non-BasicLatin_item)
    where $x ne ''
    return <character>{$x}</character>

return
<characters distinct-count="{count($distinct_character)}">
{$distinct_character}
</characters>
答案 0 :(得分:1)
Unicode 字符数据库(UCD)的 XML 版本可从 Unicode 联盟(Unicode Consortium)获得;其扁平(flat)版本和分组(grouped)版本都包含如下形式的块列表:
<blocks>
<block first-cp="0000" last-cp="007F" name="Basic Latin"/>
<block first-cp="0080" last-cp="00FF" name="Latin-1 Supplement"/>
<block first-cp="0100" last-cp="017F" name="Latin Extended-A"/>
<block first-cp="0180" last-cp="024F" name="Latin Extended-B"/>
...
<block first-cp="2F800" last-cp="2FA1F" name="CJK Compatibility Ideographs Supplement"/>
<block first-cp="E0000" last-cp="E007F" name="Tags"/>
<block first-cp="E0100" last-cp="E01EF" name="Variation Selectors Supplement"/>
<block first-cp="F0000" last-cp="FFFFF" name="Supplementary Private Use Area-A"/>
<block first-cp="100000" last-cp="10FFFF" name="Supplementary Private Use Area-B"/>
</blocks>
编写一个接受字符的函数,在数据库中查找包含该字符码点的块范围(比较时需要把范围边界从十六进制转换为十进制),并返回块的名称(或对应的 block 元素)。如果速度成为问题,可以复制一份块列表,并一次性完成所有十六进制到十进制的转换。
请注意,在Unicode的生命周期中,某些块名称和一些块边界已发生变化;您需要决定要使用的数据库版本。
答案 1 :(得分:0)
XQuery 中没有可以直接做到这一点的内置函数。您需要先取得字符的码点,再自行借助一个数据库(最好是 XML 文档形式)把码点映射到对应的 Unicode 块。
使用 string-to-codepoints('漢字')
即可取得码点,该函数返回一个整数序列。
答案 2 :(得分:0)
如果其他人也觉得有用,下面是我根据 Jens Erat 和 C. M. Sperberg-McQueen 的建议写出的查询(使用 Unicode 6.3.0):
xquery version "1.0";
(:
 : Reports the names of the Unicode blocks to which the non-Basic-Latin
 : characters of the input collection belong.
 :
 : Output: a <distinct-blocks> element with
 :   @distinct-block-count     - number of distinct block names found
 :   @distinct-character-count - number of distinct code points found
 : and one <block> child per block name, sorted by name. Because of the
 : default element namespace declared below, the constructed elements are
 : in the "urn:isbn:1-931666-22-9" namespace.
 :
 : Block ranges are hard-coded as decimal code points; the author states
 : they were taken from Unicode 6.3.0. A code point that falls in none of
 : the listed ranges is reported as "No_Block" (the default Block value in
 : the Unicode Character Database) instead of being silently dropped.
 :)
declare namespace ead = "urn:isbn:1-931666-22-9";
declare default element namespace "urn:isbn:1-931666-22-9";
declare copy-namespaces no-preserve, inherit;
import module namespace functx = "http://www.functx.com"
    at "http://www.xqueryfunctions.com/xq/functx-1.0-doc-2007-01.xq";

(: Documents to scan.
 : NOTE(review): "[input_directory]" looks like a placeholder for the real
 : collection URI - confirm before running. :)
declare variable $COLL as document-node()+ := collection("[input_directory]");

(: Text nodes containing at least one character outside Basic Latin. :)
let $non-BasicLatin_all := $COLL//text()[matches(., '\P{IsBasicLatin}')]

(: Every regex match found in those text nodes, duplicates included. :)
let $non-BasicLatin_item :=
    for $x in $non-BasicLatin_all
    return functx:get-matches($x, '\P{IsBasicLatin}')

(: Distinct matches converted to integer code points. Empty strings are
 : filtered out before conversion so they contribute nothing. :)
let $distinct_character :=
    for $x in distinct-values($non-BasicLatin_item)
    where $x ne ''
    order by $x
    return string-to-codepoints($x)

(: Block name for each distinct code point. The "Basic Latin" range is kept
 : for completeness even though the \P{IsBasicLatin} filter above excludes
 : those characters. :)
let $block-name :=
    for $x in distinct-values($distinct_character)
    return
        if (functx:between-inclusive($x, 0, 127)) then "Basic Latin"
        else if (functx:between-inclusive($x, 128, 255)) then "Latin-1 Supplement"
        else if (functx:between-inclusive($x, 256, 383)) then "Latin Extended-A"
        else if (functx:between-inclusive($x, 384, 591)) then "Latin Extended-B"
        else if (functx:between-inclusive($x, 592, 687)) then "IPA Extensions"
        else if (functx:between-inclusive($x, 688, 767)) then "Spacing Modifier Letters"
        else if (functx:between-inclusive($x, 768, 879)) then "Combining Diacritical Marks"
        else if (functx:between-inclusive($x, 880, 1023)) then "Greek and Coptic"
        else if (functx:between-inclusive($x, 1024, 1279)) then "Cyrillic"
        else if (functx:between-inclusive($x, 1280, 1327)) then "Cyrillic Supplement"
        else if (functx:between-inclusive($x, 1328, 1423)) then "Armenian"
        else if (functx:between-inclusive($x, 1424, 1535)) then "Hebrew"
        else if (functx:between-inclusive($x, 1536, 1791)) then "Arabic"
        else if (functx:between-inclusive($x, 1792, 1871)) then "Syriac"
        else if (functx:between-inclusive($x, 1872, 1919)) then "Arabic Supplement"
        else if (functx:between-inclusive($x, 1920, 1983)) then "Thaana"
        else if (functx:between-inclusive($x, 1984, 2047)) then "NKo"
        else if (functx:between-inclusive($x, 2048, 2111)) then "Samaritan"
        else if (functx:between-inclusive($x, 2112, 2143)) then "Mandaic"
        else if (functx:between-inclusive($x, 2208, 2303)) then "Arabic Extended-A"
        else if (functx:between-inclusive($x, 2304, 2431)) then "Devanagari"
        else if (functx:between-inclusive($x, 2432, 2559)) then "Bengali"
        else if (functx:between-inclusive($x, 2560, 2687)) then "Gurmukhi"
        else if (functx:between-inclusive($x, 2688, 2815)) then "Gujarati"
        else if (functx:between-inclusive($x, 2816, 2943)) then "Oriya"
        else if (functx:between-inclusive($x, 2944, 3071)) then "Tamil"
        else if (functx:between-inclusive($x, 3072, 3199)) then "Telugu"
        else if (functx:between-inclusive($x, 3200, 3327)) then "Kannada"
        else if (functx:between-inclusive($x, 3328, 3455)) then "Malayalam"
        else if (functx:between-inclusive($x, 3456, 3583)) then "Sinhala"
        else if (functx:between-inclusive($x, 3584, 3711)) then "Thai"
        else if (functx:between-inclusive($x, 3712, 3839)) then "Lao"
        else if (functx:between-inclusive($x, 3840, 4095)) then "Tibetan"
        else if (functx:between-inclusive($x, 4096, 4255)) then "Myanmar"
        else if (functx:between-inclusive($x, 4256, 4351)) then "Georgian"
        else if (functx:between-inclusive($x, 4352, 4607)) then "Hangul Jamo"
        else if (functx:between-inclusive($x, 4608, 4991)) then "Ethiopic"
        else if (functx:between-inclusive($x, 4992, 5023)) then "Ethiopic Supplement"
        else if (functx:between-inclusive($x, 5024, 5119)) then "Cherokee"
        else if (functx:between-inclusive($x, 5120, 5759)) then "Unified Canadian Aboriginal Syllabics"
        else if (functx:between-inclusive($x, 5760, 5791)) then "Ogham"
        else if (functx:between-inclusive($x, 5792, 5887)) then "Runic"
        else if (functx:between-inclusive($x, 5888, 5919)) then "Tagalog"
        else if (functx:between-inclusive($x, 5920, 5951)) then "Hanunoo"
        else if (functx:between-inclusive($x, 5952, 5983)) then "Buhid"
        else if (functx:between-inclusive($x, 5984, 6015)) then "Tagbanwa"
        else if (functx:between-inclusive($x, 6016, 6143)) then "Khmer"
        else if (functx:between-inclusive($x, 6144, 6319)) then "Mongolian"
        else if (functx:between-inclusive($x, 6320, 6399)) then "Unified Canadian Aboriginal Syllabics Extended"
        else if (functx:between-inclusive($x, 6400, 6479)) then "Limbu"
        else if (functx:between-inclusive($x, 6480, 6527)) then "Tai Le"
        else if (functx:between-inclusive($x, 6528, 6623)) then "New Tai Lue"
        else if (functx:between-inclusive($x, 6624, 6655)) then "Khmer Symbols"
        else if (functx:between-inclusive($x, 6656, 6687)) then "Buginese"
        else if (functx:between-inclusive($x, 6688, 6831)) then "Tai Tham"
        else if (functx:between-inclusive($x, 6912, 7039)) then "Balinese"
        else if (functx:between-inclusive($x, 7040, 7103)) then "Sundanese"
        else if (functx:between-inclusive($x, 7104, 7167)) then "Batak"
        else if (functx:between-inclusive($x, 7168, 7247)) then "Lepcha"
        else if (functx:between-inclusive($x, 7248, 7295)) then "Ol Chiki"
        else if (functx:between-inclusive($x, 7360, 7375)) then "Sundanese Supplement"
        else if (functx:between-inclusive($x, 7376, 7423)) then "Vedic Extensions"
        else if (functx:between-inclusive($x, 7424, 7551)) then "Phonetic Extensions"
        else if (functx:between-inclusive($x, 7552, 7615)) then "Phonetic Extensions Supplement"
        else if (functx:between-inclusive($x, 7616, 7679)) then "Combining Diacritical Marks Supplement"
        else if (functx:between-inclusive($x, 7680, 7935)) then "Latin Extended Additional"
        else if (functx:between-inclusive($x, 7936, 8191)) then "Greek Extended"
        else if (functx:between-inclusive($x, 8192, 8303)) then "General Punctuation"
        else if (functx:between-inclusive($x, 8304, 8351)) then "Superscripts and Subscripts"
        else if (functx:between-inclusive($x, 8352, 8399)) then "Currency Symbols"
        else if (functx:between-inclusive($x, 8400, 8447)) then "Combining Diacritical Marks for Symbols"
        else if (functx:between-inclusive($x, 8448, 8527)) then "Letterlike Symbols"
        else if (functx:between-inclusive($x, 8528, 8591)) then "Number Forms"
        else if (functx:between-inclusive($x, 8592, 8703)) then "Arrows"
        else if (functx:between-inclusive($x, 8704, 8959)) then "Mathematical Operators"
        else if (functx:between-inclusive($x, 8960, 9215)) then "Miscellaneous Technical"
        else if (functx:between-inclusive($x, 9216, 9279)) then "Control Pictures"
        else if (functx:between-inclusive($x, 9280, 9311)) then "Optical Character Recognition"
        else if (functx:between-inclusive($x, 9312, 9471)) then "Enclosed Alphanumerics"
        else if (functx:between-inclusive($x, 9472, 9599)) then "Box Drawing"
        else if (functx:between-inclusive($x, 9600, 9631)) then "Block Elements"
        else if (functx:between-inclusive($x, 9632, 9727)) then "Geometric Shapes"
        else if (functx:between-inclusive($x, 9728, 9983)) then "Miscellaneous Symbols"
        else if (functx:between-inclusive($x, 9984, 10175)) then "Dingbats"
        else if (functx:between-inclusive($x, 10176, 10223)) then "Miscellaneous Mathematical Symbols-A"
        else if (functx:between-inclusive($x, 10224, 10239)) then "Supplemental Arrows-A"
        else if (functx:between-inclusive($x, 10240, 10495)) then "Braille Patterns"
        else if (functx:between-inclusive($x, 10496, 10623)) then "Supplemental Arrows-B"
        else if (functx:between-inclusive($x, 10624, 10751)) then "Miscellaneous Mathematical Symbols-B"
        else if (functx:between-inclusive($x, 10752, 11007)) then "Supplemental Mathematical Operators"
        else if (functx:between-inclusive($x, 11008, 11263)) then "Miscellaneous Symbols and Arrows"
        else if (functx:between-inclusive($x, 11264, 11359)) then "Glagolitic"
        else if (functx:between-inclusive($x, 11360, 11391)) then "Latin Extended-C"
        else if (functx:between-inclusive($x, 11392, 11519)) then "Coptic"
        else if (functx:between-inclusive($x, 11520, 11567)) then "Georgian Supplement"
        else if (functx:between-inclusive($x, 11568, 11647)) then "Tifinagh"
        else if (functx:between-inclusive($x, 11648, 11743)) then "Ethiopic Extended"
        else if (functx:between-inclusive($x, 11744, 11775)) then "Cyrillic Extended-A"
        else if (functx:between-inclusive($x, 11776, 11903)) then "Supplemental Punctuation"
        else if (functx:between-inclusive($x, 11904, 12031)) then "CJK Radicals Supplement"
        else if (functx:between-inclusive($x, 12032, 12255)) then "Kangxi Radicals"
        else if (functx:between-inclusive($x, 12272, 12287)) then "Ideographic Description Characters"
        else if (functx:between-inclusive($x, 12288, 12351)) then "CJK Symbols and Punctuation"
        else if (functx:between-inclusive($x, 12352, 12447)) then "Hiragana"
        else if (functx:between-inclusive($x, 12448, 12543)) then "Katakana"
        else if (functx:between-inclusive($x, 12544, 12591)) then "Bopomofo"
        else if (functx:between-inclusive($x, 12592, 12687)) then "Hangul Compatibility Jamo"
        else if (functx:between-inclusive($x, 12688, 12703)) then "Kanbun"
        else if (functx:between-inclusive($x, 12704, 12735)) then "Bopomofo Extended"
        else if (functx:between-inclusive($x, 12736, 12783)) then "CJK Strokes"
        else if (functx:between-inclusive($x, 12784, 12799)) then "Katakana Phonetic Extensions"
        else if (functx:between-inclusive($x, 12800, 13055)) then "Enclosed CJK Letters and Months"
        else if (functx:between-inclusive($x, 13056, 13311)) then "CJK Compatibility"
        else if (functx:between-inclusive($x, 13312, 19903)) then "CJK Unified Ideographs Extension A"
        else if (functx:between-inclusive($x, 19904, 19967)) then "Yijing Hexagram Symbols"
        else if (functx:between-inclusive($x, 19968, 40959)) then "CJK Unified Ideographs"
        else if (functx:between-inclusive($x, 40960, 42127)) then "Yi Syllables"
        else if (functx:between-inclusive($x, 42128, 42191)) then "Yi Radicals"
        else if (functx:between-inclusive($x, 42192, 42239)) then "Lisu"
        else if (functx:between-inclusive($x, 42240, 42559)) then "Vai"
        else if (functx:between-inclusive($x, 42560, 42655)) then "Cyrillic Extended-B"
        else if (functx:between-inclusive($x, 42656, 42751)) then "Bamum"
        else if (functx:between-inclusive($x, 42752, 42783)) then "Modifier Tone Letters"
        else if (functx:between-inclusive($x, 42784, 43007)) then "Latin Extended-D"
        else if (functx:between-inclusive($x, 43008, 43055)) then "Syloti Nagri"
        else if (functx:between-inclusive($x, 43056, 43071)) then "Common Indic Number Forms"
        else if (functx:between-inclusive($x, 43072, 43135)) then "Phags-pa"
        else if (functx:between-inclusive($x, 43136, 43231)) then "Saurashtra"
        else if (functx:between-inclusive($x, 43232, 43263)) then "Devanagari Extended"
        else if (functx:between-inclusive($x, 43264, 43311)) then "Kayah Li"
        else if (functx:between-inclusive($x, 43312, 43359)) then "Rejang"
        else if (functx:between-inclusive($x, 43360, 43391)) then "Hangul Jamo Extended-A"
        else if (functx:between-inclusive($x, 43392, 43487)) then "Javanese"
        else if (functx:between-inclusive($x, 43520, 43615)) then "Cham"
        else if (functx:between-inclusive($x, 43616, 43647)) then "Myanmar Extended-A"
        else if (functx:between-inclusive($x, 43648, 43743)) then "Tai Viet"
        else if (functx:between-inclusive($x, 43744, 43775)) then "Meetei Mayek Extensions"
        else if (functx:between-inclusive($x, 43776, 43823)) then "Ethiopic Extended-A"
        else if (functx:between-inclusive($x, 43968, 44031)) then "Meetei Mayek"
        else if (functx:between-inclusive($x, 44032, 55215)) then "Hangul Syllables"
        else if (functx:between-inclusive($x, 55216, 55295)) then "Hangul Jamo Extended-B"
        else if (functx:between-inclusive($x, 55296, 56191)) then "High Surrogates"
        else if (functx:between-inclusive($x, 56192, 56319)) then "High Private Use Surrogates"
        else if (functx:between-inclusive($x, 56320, 57343)) then "Low Surrogates"
        else if (functx:between-inclusive($x, 57344, 63743)) then "Private Use Area"
        else if (functx:between-inclusive($x, 63744, 64255)) then "CJK Compatibility Ideographs"
        else if (functx:between-inclusive($x, 64256, 64335)) then "Alphabetic Presentation Forms"
        else if (functx:between-inclusive($x, 64336, 65023)) then "Arabic Presentation Forms-A"
        else if (functx:between-inclusive($x, 65024, 65039)) then "Variation Selectors"
        else if (functx:between-inclusive($x, 65040, 65055)) then "Vertical Forms"
        else if (functx:between-inclusive($x, 65056, 65071)) then "Combining Half Marks"
        else if (functx:between-inclusive($x, 65072, 65103)) then "CJK Compatibility Forms"
        else if (functx:between-inclusive($x, 65104, 65135)) then "Small Form Variants"
        else if (functx:between-inclusive($x, 65136, 65279)) then "Arabic Presentation Forms-B"
        else if (functx:between-inclusive($x, 65280, 65519)) then "Halfwidth and Fullwidth Forms"
        else if (functx:between-inclusive($x, 65520, 65535)) then "Specials"
        else if (functx:between-inclusive($x, 65536, 65663)) then "Linear B Syllabary"
        else if (functx:between-inclusive($x, 65664, 65791)) then "Linear B Ideograms"
        else if (functx:between-inclusive($x, 65792, 65855)) then "Aegean Numbers"
        else if (functx:between-inclusive($x, 65856, 65935)) then "Ancient Greek Numbers"
        else if (functx:between-inclusive($x, 65936, 65999)) then "Ancient Symbols"
        else if (functx:between-inclusive($x, 66000, 66047)) then "Phaistos Disc"
        else if (functx:between-inclusive($x, 66176, 66207)) then "Lycian"
        else if (functx:between-inclusive($x, 66208, 66271)) then "Carian"
        else if (functx:between-inclusive($x, 66304, 66351)) then "Old Italic"
        else if (functx:between-inclusive($x, 66352, 66383)) then "Gothic"
        else if (functx:between-inclusive($x, 66432, 66463)) then "Ugaritic"
        else if (functx:between-inclusive($x, 66464, 66527)) then "Old Persian"
        else if (functx:between-inclusive($x, 66560, 66639)) then "Deseret"
        else if (functx:between-inclusive($x, 66640, 66687)) then "Shavian"
        else if (functx:between-inclusive($x, 66688, 66735)) then "Osmanya"
        else if (functx:between-inclusive($x, 67584, 67647)) then "Cypriot Syllabary"
        else if (functx:between-inclusive($x, 67648, 67679)) then "Imperial Aramaic"
        else if (functx:between-inclusive($x, 67840, 67871)) then "Phoenician"
        else if (functx:between-inclusive($x, 67872, 67903)) then "Lydian"
        else if (functx:between-inclusive($x, 67968, 67999)) then "Meroitic Hieroglyphs"
        else if (functx:between-inclusive($x, 68000, 68095)) then "Meroitic Cursive"
        else if (functx:between-inclusive($x, 68096, 68191)) then "Kharoshthi"
        else if (functx:between-inclusive($x, 68192, 68223)) then "Old South Arabian"
        else if (functx:between-inclusive($x, 68352, 68415)) then "Avestan"
        else if (functx:between-inclusive($x, 68416, 68447)) then "Inscriptional Parthian"
        else if (functx:between-inclusive($x, 68448, 68479)) then "Inscriptional Pahlavi"
        else if (functx:between-inclusive($x, 68608, 68687)) then "Old Turkic"
        else if (functx:between-inclusive($x, 69216, 69247)) then "Rumi Numeral Symbols"
        else if (functx:between-inclusive($x, 69632, 69759)) then "Brahmi"
        else if (functx:between-inclusive($x, 69760, 69839)) then "Kaithi"
        else if (functx:between-inclusive($x, 69840, 69887)) then "Sora Sompeng"
        else if (functx:between-inclusive($x, 69888, 69967)) then "Chakma"
        else if (functx:between-inclusive($x, 70016, 70111)) then "Sharada"
        else if (functx:between-inclusive($x, 71296, 71375)) then "Takri"
        else if (functx:between-inclusive($x, 73728, 74751)) then "Cuneiform"
        else if (functx:between-inclusive($x, 74752, 74879)) then "Cuneiform Numbers and Punctuation"
        else if (functx:between-inclusive($x, 77824, 78895)) then "Egyptian Hieroglyphs"
        else if (functx:between-inclusive($x, 92160, 92735)) then "Bamum Supplement"
        else if (functx:between-inclusive($x, 93952, 94111)) then "Miao"
        else if (functx:between-inclusive($x, 110592, 110847)) then "Kana Supplement"
        else if (functx:between-inclusive($x, 118784, 119039)) then "Byzantine Musical Symbols"
        else if (functx:between-inclusive($x, 119040, 119295)) then "Musical Symbols"
        else if (functx:between-inclusive($x, 119296, 119375)) then "Ancient Greek Musical Notation"
        else if (functx:between-inclusive($x, 119552, 119647)) then "Tai Xuan Jing Symbols"
        else if (functx:between-inclusive($x, 119648, 119679)) then "Counting Rod Numerals"
        else if (functx:between-inclusive($x, 119808, 120831)) then "Mathematical Alphanumeric Symbols"
        else if (functx:between-inclusive($x, 126464, 126719)) then "Arabic Mathematical Alphabetic Symbols"
        else if (functx:between-inclusive($x, 126976, 127023)) then "Mahjong Tiles"
        else if (functx:between-inclusive($x, 127024, 127135)) then "Domino Tiles"
        else if (functx:between-inclusive($x, 127136, 127231)) then "Playing Cards"
        else if (functx:between-inclusive($x, 127232, 127487)) then "Enclosed Alphanumeric Supplement"
        else if (functx:between-inclusive($x, 127488, 127743)) then "Enclosed Ideographic Supplement"
        else if (functx:between-inclusive($x, 127744, 128511)) then "Miscellaneous Symbols And Pictographs"
        else if (functx:between-inclusive($x, 128512, 128591)) then "Emoticons"
        else if (functx:between-inclusive($x, 128640, 128767)) then "Transport And Map Symbols"
        else if (functx:between-inclusive($x, 128768, 128895)) then "Alchemical Symbols"
        else if (functx:between-inclusive($x, 131072, 173791)) then "CJK Unified Ideographs Extension B"
        else if (functx:between-inclusive($x, 173824, 177983)) then "CJK Unified Ideographs Extension C"
        else if (functx:between-inclusive($x, 177984, 178207)) then "CJK Unified Ideographs Extension D"
        else if (functx:between-inclusive($x, 194560, 195103)) then "CJK Compatibility Ideographs Supplement"
        else if (functx:between-inclusive($x, 917504, 917631)) then "Tags"
        else if (functx:between-inclusive($x, 917760, 917999)) then "Variation Selectors Supplement"
        else if (functx:between-inclusive($x, 983040, 1048575)) then "Supplementary Private Use Area-A"
        else if (functx:between-inclusive($x, 1048576, 1114111)) then "Supplementary Private Use Area-B"
        (: Not covered by any range above: report it explicitly so the block
         : list stays consistent with distinct-character-count. :)
        else "No_Block"

(: One <block> element per distinct block name, sorted by name. :)
let $distinct_block-name :=
    for $x in distinct-values($block-name)
    order by $x
    return <block>{$x}</block>

return
<distinct-blocks distinct-block-count="{count($distinct_block-name)}" distinct-character-count="{count($distinct_character)}">
{$distinct_block-name}
</distinct-blocks>