获取波斯字符初始内侧最终unicode

时间:2014-01-21 08:40:40

标签: c# unicode-string persian

我希望将Unicode波斯语字符放在一个单词中。

例如将“چارت”转换为“Ê”''''''''''

我发现ArabicLigaturizer可以用阿拉伯语做我需要的。

在.net中有没有一种简单的方法为波斯语做这个?

我使用此代码

    (System.Text.RegularExpressions.Regex.IsMatch(SearchText, @"\p{IsArabic}"))
 SearchText = (new iTextSharp.text.pdf.ArabicLigaturizer()).Process(SearchText);

但仍有一些文件有问题。我认为这取决于PdfWriter。

2 个答案:

答案 0 :(得分:2)

我为我的一个项目编写了一个C#代码,对波斯文本进行硬编码并转换它们,希望这有助于它(它也会反转它们,因为我为Unity3D编写了不支持RTL的代码): / p>

//This library maps persian characters to unicode and makes them rtl, in order to use in softwares like Unity3D
//Author: Shayan Edalatmanesh

using System.Collections.Generic;
using System.Text.RegularExpressions;

public static class PersianMap
{
    static string persianChars = "ئءؤرلایيةوزژظشسيبپلاتنمكکگطضصثقفغعهخحچجدذْلآآلأأـلإإًٌٍَُِّْٰ";
    static string tashkil = "ًٌٍَُِّْٰ";
static Dictionary<char?, char[]> map = new Dictionary<char?, char[]>()
{
        {'آ', new char[] {'ﺁ', 'ﺁ', 'ﺂ', 'ﺂ'}}, 
        {'ا', new char[] {'ﺍ', 'ﺍ', 'ﺎ', 'ﺎ'}}, 
        {'أ', new char[] {'ﺃ', 'ﺃ', 'ﺄ', 'ﺄ'}}, 
        {'إ', new char[] {'ﺇ', 'ﺇ', 'ﺈ', 'ﺈ'}}, 
        {'ب', new char[] {'ﺏ', 'ﺑ', 'ﺒ', 'ﺐ'}}, 
        {'پ', new char[] {'ﭖ', 'ﭘ', 'ﭙ', 'ﭗ'}}, 
        {'ت', new char[] {'ﺕ', 'ﺗ', 'ﺘ', 'ﺖ'}}, 
        {'ث', new char[] {'ﺙ', 'ﺛ', 'ﺜ', 'ﺚ'}}, 
        {'ج', new char[] {'ﺝ', 'ﺟ', 'ﺠ', 'ﺞ'}}, 
        {'چ', new char[] {'ﭺ', 'ﭼ', 'ﭽ', 'ﭻ'}}, 
        {'ح', new char[] {'ﺡ', 'ﺣ', 'ﺤ', 'ﺢ'}}, 
        {'خ', new char[] {'ﺥ', 'ﺧ', 'ﺨ', 'ﺦ'}}, 
        {'د', new char[] {'ﺩ', 'ﺩ', 'ﺪ', 'ﺪ'}}, 
        {'ذ', new char[] {'ﺫ', 'ﺫ', 'ﺬ', 'ﺬ'}}, 
        {'ر', new char[] {'ﺭ', 'ﺭ', 'ﺮ', 'ﺮ'}}, 
        {'ز', new char[] {'ﺯ', 'ﺯ', 'ﺰ', 'ﺰ'}}, 
        {'ژ', new char[] {'ﮊ', 'ﮊ', 'ﮋ', 'ﮋ'}}, 
        {'س', new char[] {'ﺱ', 'ﺳ', 'ﺴ', 'ﺲ'}}, 
        {'ش', new char[] {'ﺵ', 'ﺷ', 'ﺸ', 'ﺶ'}}, 
        {'ص', new char[] {'ﺹ', 'ﺻ', 'ﺼ', 'ﺺ'}}, 
        {'ض', new char[] {'ﺽ', 'ﺿ', 'ﻀ', 'ﺾ'}}, 
        {'ط', new char[] {'ﻁ', 'ﻃ', 'ﻄ', 'ﻂ'}}, 
        {'ظ', new char[] {'ﻅ', 'ﻇ', 'ﻈ', 'ﻆ'}}, 
        {'ع', new char[] {'ﻉ', 'ﻋ', 'ﻌ', 'ﻊ'}}, 
        {'غ', new char[] {'ﻍ', 'ﻏ', 'ﻐ', 'ﻎ'}}, 
        {'ف', new char[] {'ﻑ', 'ﻓ', 'ﻔ', 'ﻒ'}}, 
        {'ق', new char[] {'ﻕ', 'ﻗ', 'ﻘ', 'ﻖ'}}, 
        {'ک', new char[] {'ﮎ', 'ﮐ', 'ﮑ', 'ﮏ'}}, 
        {'ك', new char[] {'ﻙ', 'ﻛ', 'ﻜ', 'ﻚ'}}, 
        {'گ', new char[] {'ﮒ', 'ﮔ', 'ﮕ', 'ﮓ'}}, 
        {'ل', new char[] {'ﻝ', 'ﻟ', 'ﻠ', 'ﻞ'}}, 
        {'م', new char[] {'ﻡ', 'ﻣ', 'ﻤ', 'ﻢ'}}, 
        {'ن', new char[] {'ﻥ', 'ﻧ', 'ﻨ', 'ﻦ'}}, 
        {'و', new char[] {'ﻭ', 'ﻭ', 'ﻮ', 'ﻮ'}}, 
        {'ؤ', new char[] {'ﺅ', 'ﺅ', 'ﺆ', 'ﺆ'}}, 
        {'ه', new char[] {'ﻩ', 'ﻫ', 'ﻬ', 'ﻪ'}}, 
        {'ة', new char[] {'ﺓ', 'ﺓ', 'ﺔ', 'ﺔ'}}, 
        {'ی', new char[] {'ﯼ', 'ﯾ', 'ﯿ', 'ﯽ'}}, 
        {'ي', new char[] {'ﻱ', 'ﻳ', 'ﻴ', 'ﻲ'}}, 
        {'ئ', new char[] {'ﺉ', 'ﺋ', 'ﺌ', 'ﺊ'}}, 
        {'ـ', new char[] { 'ـ', 'ـ', 'ـ', 'ـ'}},
};

static Dictionary<string, char[]> doubleMap = new Dictionary<string, char[]>()
{
    {"لآ", new char[] {'ﻵ', 'ﻵ', 'ﻶ', 'ﻶ'}}, 
    {"لا", new char[] {'ﻻ', 'ﻻ', 'ﻼ', 'ﻼ'}}, 
    {"لأ", new char[] {'ﻷ', 'ﻷ', 'ﻸ', 'ﻸ'}}, 
    {"لإ", new char[] {'ﻹ', 'ﻹ', 'ﻺ', 'ﻺ'}}, 
};

//The letters that after them, next letter will be stuck
static List<char?> afterStick = new List<char?>()
{
    'ب','پ','ت','ث','ج','چ','ح','خ','س','ش','ص','ض','ط','ظ','ع','غ','ف','ق','ک','ك','گ',
    'ل','م','ن','ه','ی','ي','ئ','ـ',
};

//0: Isolated
//1: Begin
//2: Middle
//3: End
static int getGlyphType(char chr, char? before, char? after)
{
    if (before == null && after == null)
        return 0;

    bool afs_chr = afterStick.Contains(chr);
    bool afs_bfr = before != null && afterStick.Contains(before);

    if (afs_chr && after != null && map.ContainsKey(after))
    {
        if (afs_chr && afs_bfr)
            return 2;
        else
            return 1;
    }
    else
    {
        if (afs_bfr)
            return 3;
        else
            return 0;
    }
}

static string convertWord(string word)
{
    string result = "";
    //Convert word
    for (int i = 0; i < word.Length; i++)
    {
        if (!map.ContainsKey(word[i]))
        {
            result += word[i].ToString();
            continue;
        }

        char? before = null, after = null;

        //Find the previous letter that is not tashkil
        for (int k = i - 1; k >= 0; k--)
        {
            if (!tashkil.Contains(word[k].ToString()))
            {
                before = word[k];
                break;
            }
        }

        //Find the next letter that is not tashkil
        for (int k = i + 1; k < word.Length; k++)
        {
            if (!tashkil.Contains(word[k].ToString()))
            {
                after = word[k];
                break;
            }
        }           

        int glyph = getGlyphType(word[i], before, after);

        //Consider doubleMap
        if (after != null)
        {
            string dm = word[i].ToString() + after.ToString();
            if (doubleMap.ContainsKey(dm))
            {
                result += doubleMap[dm][glyph].ToString();
                i++; //Skip the next letter
                continue;
            }
        }

        char mapped = map[word[i]][glyph];

        result += mapped.ToString();
    }

    //Reverse
    int len = result.Length;
    char[] rev = new char[len];
    for (int i = 0; i < len; i++)
    {
        rev[i] = result[len - i - 1];
    }

    return new string(rev);
}

static Regex wordRE = new Regex("(?<fa>[" + persianChars + "]*)(?<nfa>[^" + persianChars + "]*)");
static Regex spaceRE = new Regex("( )*([^ ]*)( )*");
static Regex persianRE = new Regex("[" + persianChars + "]");
static string convertLine(string line)
{
    //Does not contain any persian characters
    if (!persianRE.IsMatch(line))
        return line;

    string res = "";
    MatchCollection matches = wordRE.Matches(line);
    for (int i = matches.Count - 1; i >= 0; i--)
    {
        //If the non-persian word contains space, put the space on it's other side
        res += spaceRE.Replace(matches[i].Groups["nfa"].Value, "$3$2$1");

        //Convert the persian part
        res += convertWord(matches[i].Groups["fa"].Value);
    }
    return res;
}

public static string ConvertPersian(string str)
{       
    string[] lines = str.Split('\n');
    for (int i = 0; i < lines.Length; i++)
    {
        lines[i] = convertLine(lines[i]);
    }
    return string.Join("\n", lines);
}

public static string FixPersian(this string str)
{
    return ConvertPersian(str);
}
}

答案 1 :(得分:0)

Persian只有Arabic多个字符,一般来说,您应该能够将Arabic工具与Persian一起使用。

顺便说一下,您可以使用Fardis project来提取每个字符的Unicode代码。你也可以使用源代码。