我有一个问题:我需要将输入字符串拆分为所有可能的 Prefix、Stem 和 Suffix 组合。
规则是:
Prefix = 0-4 个字符
Stem = 1 个或更多字符
Suffix = 0-6 个字符
假设我输入“wbAlErbyp”需要拆分(不完整样本):
我该如何实现这一目标?
编辑1:
好的,这是我的旧解决方案(它很长,也很不专业),我自己已经看不懂了,所以想重新设计。
/// <summary>
/// Transliterates an Arabic word to Latin characters, tries every prefix/stem/suffix
/// split allowed by the length limits, and returns one meaning string for each
/// combination of known word bits whose types are all mutually compatible.
/// </summary>
/// <param name="inputTextArabic">
/// The word in Arabic script. Characters that are not in the transliteration table are dropped.
/// </param>
/// <returns>
/// The composed meaning of every match, in the order the splits are tried
/// (stem length ascending, then prefix length ascending).
/// </returns>
public static List<string> GetMatches(string inputTextArabic)
{
    // Length limits of the segmentation: prefix 0-4, stem 1 or more, suffix 0-6.
    const int maxPrefixLength = 4;
    const int maxSuffixLength = 6;

    List<string> results = new List<string>();
    string latString = TransliterateToLatin(inputTextArabic);
    int lenWord = latString.Length;

    for (int lenStem = 1; lenStem <= lenWord; lenStem++)
    {
        // The prefix can use at most what the stem leaves over.
        int lenPrefMax = lenWord - lenStem;
        if (lenPrefMax > maxPrefixLength)
        {
            lenPrefMax = maxPrefixLength;
        }

        for (int lenPref = 0; lenPref <= lenPrefMax; lenPref++)
        {
            // The three parts must cover the whole word, so the suffix length is
            // fully determined; only its upper limit needs checking.
            int lenSuff = lenWord - lenStem - lenPref;
            if (lenSuff > maxSuffixLength)
            {
                continue;
            }

            string prefix = latString.Substring(0, lenPref);
            string stem = latString.Substring(lenPref, lenStem);
            string suffix = latString.Substring(lenPref + lenStem, lenSuff);

            // All three parts must be known word bits, otherwise this split is useless.
            List<WordBit> prefMatches = prefixes.Where(x => x.NoVowels == prefix).Distinct().ToList();
            if (prefMatches.Count == 0)
            {
                continue;
            }

            List<WordBit> stemMatches = stems.Where(x => x.NoVowels == stem).Distinct().ToList();
            if (stemMatches.Count == 0)
            {
                continue;
            }

            List<WordBit> suffMatches = suffixes.Where(x => x.NoVowels == suffix).Distinct().ToList();
            if (suffMatches.Count == 0)
            {
                continue;
            }

            foreach (WordBit prefMatch in prefMatches)
            {
                string prefType = prefMatch.Type;

                foreach (WordBit stemMatch in stemMatches)
                {
                    string stemType = stemMatch.Type;

                    // Does not depend on the suffix, so check it once per prefix/stem pair.
                    bool prefStemConnects = prefixStemConns.Any(x => x.Type1 == prefType && x.Type2 == stemType);
                    if (!prefStemConnects)
                    {
                        continue;
                    }

                    foreach (WordBit suffMatch in suffMatches)
                    {
                        string suffType = suffMatch.Type;

                        bool stemSuffConnects = stemSuffixConns.Any(x => x.Type1 == stemType && x.Type2 == suffType);
                        bool prefSuffConnects = prefixSuffixConns.Any(x => x.Type1 == prefType && x.Type2 == suffType);

                        // All three pairings must be compatible. The previous code tested
                        // prefStemConnects twice and never looked at prefSuffConnects.
                        if (!stemSuffConnects || !prefSuffConnects)
                        {
                            continue;
                        }

                        Match match = BuildMatch(prefMatch, stemMatch, suffMatch);
                        results.Add(match.MatchMeaning);

                        Debug.Print("_____________________________________________________________________________________");
                        PrintWordBit(prefMatch);
                        PrintWordBit(stemMatch);
                        PrintWordBit(suffMatch);
                        Debug.Print("______________________________________________________________________________________");
                    }
                }
            }
        }
    }

    return results;
}

/// <summary>
/// Converts Arabic script to the Latin transliteration used by the word-bit tables.
/// Characters without a table entry are skipped.
/// </summary>
private static string TransliterateToLatin(string arabicText)
{
    // Fully qualified so the helper does not require an extra using directive.
    System.Text.StringBuilder latin = new System.Text.StringBuilder(arabicText.Length);

    foreach (char arabicChar in arabicText)
    {
        switch (arabicChar)
        {
            case 'ا': latin.Append('A'); break;
            case 'آ': latin.Append('|'); break;
            case 'ؤ': latin.Append('&'); break;
            case 'ئ': latin.Append('}'); break;
            case 'أ': latin.Append('>'); break;
            case 'إ': latin.Append('<'); break;
            case 'ء': latin.Append('\\'); break;
            case 'ب': latin.Append('b'); break;
            case 'ت': latin.Append('t'); break;
            case 'ة': latin.Append('p'); break;
            case 'ث': latin.Append('v'); break;
            case 'ج': latin.Append('j'); break;
            case 'ح': latin.Append('H'); break;
            case 'خ': latin.Append('x'); break;
            case 'د': latin.Append('d'); break;
            case 'ذ': latin.Append('*'); break;
            case 'ر': latin.Append('r'); break;
            case 'ز': latin.Append('z'); break;
            case 'س': latin.Append('s'); break;
            case 'ش': latin.Append('$'); break;
            case 'ص': latin.Append('S'); break;
            case 'ض': latin.Append('D'); break;
            case 'ط': latin.Append('T'); break;
            case 'ظ': latin.Append('Z'); break;
            case 'ع': latin.Append('E'); break;
            case 'غ': latin.Append('g'); break;
            case 'ـ': latin.Append('_'); break;
            case 'ف': latin.Append('f'); break;
            case 'ق': latin.Append('q'); break;
            case 'ك': latin.Append('k'); break;
            case 'ل': latin.Append('l'); break;
            case 'م': latin.Append('m'); break;
            case 'ن': latin.Append('n'); break;
            case 'ه': latin.Append('h'); break;
            case 'و': latin.Append('w'); break;
            case 'ى': latin.Append('Y'); break;
            case 'ي': latin.Append('y'); break;
            case 'ً': latin.Append('F'); break;
            case 'ٌ': latin.Append('N'); break;
            case 'ٍ': latin.Append('K'); break;
            case 'َ': latin.Append('a'); break;
            case 'ُ': latin.Append('u'); break;
            case 'ِ': latin.Append('i'); break;
            case 'ّ': latin.Append('~'); break;
            case 'ْ': latin.Append('o'); break;
        }
    }

    return latin.ToString();
}

/// <summary>
/// Composes the meaning of one compatible prefix/stem/suffix combination from the
/// leading part of each word bit's Extra text.
/// </summary>
private static Match BuildMatch(WordBit prefMatch, WordBit stemMatch, WordBit suffMatch)
{
    Match match = new Match();

    // Root meaning: stem text up to the first double whitespace, or all of it.
    match.MatchMeaning = match.RootMeaning = Regex.Match(stemMatch.Extra, @"^.*?(?=\s\s|$)").ToString();

    // Suffix info: text up to the first double whitespace; empty when there is none.
    match.SuffixInfo = Regex.Match(suffMatch.Extra, @"^.*?(?=\s\s)").ToString();
    if (match.SuffixInfo != "")
    {
        if (match.SuffixInfo.Contains("<verb>"))
        {
            // The suffix is a template: put the root meaning in place of the placeholder.
            match.MatchMeaning = match.SuffixInfo.Replace("<verb>", match.RootMeaning);
            match.SuffixInfo = "";
        }
        else
        {
            match.MatchMeaning = match.MatchMeaning + " " + match.SuffixInfo;
        }
    }

    // Prefix info: first whitespace-delimited token of the prefix text.
    match.PrefixInfo = Regex.Match(prefMatch.Extra, @"^.*?(?=\s|\s\s|$)").ToString();
    if (match.PrefixInfo != "")
    {
        // NOTE(review): this rebuilds the meaning from the root, discarding a "<verb>"
        // substitution made above. Kept as in the original; confirm it is intended.
        match.MatchMeaning = match.PrefixInfo + " " + match.RootMeaning + " " + match.SuffixInfo;
    }

    return match;
}

/// <summary>Writes the fields of one word bit to the debug output as a tab-separated row.</summary>
private static void PrintWordBit(WordBit bit)
{
    Debug.Print(bit.NoVowels + "\t\t" + bit.Vowels + "\t\t" + bit.Type + "\t\t" + bit.Extra);
}
答案 0 :(得分:1)
只需构建两个嵌套循环,迭代所有可能的前缀和后缀长度。
// Print every prefix/stem/suffix split of the word that respects the length limits.
string s = "wbAlErbyp";
const int maxPrefixLength = 4;
const int maxSuffixLength = 6;
const int minStemLength = 1;

// The stem needs at least minStemLength characters, which caps the prefix length.
int prefixLimit = Math.Min(maxPrefixLength, s.Length - minStemLength);
for (int headLength = 0; headLength <= prefixLimit; headLength++)
{
    // Whatever the prefix and the minimal stem leave over caps the suffix length.
    int suffixLimit = Math.Min(maxSuffixLength, s.Length - minStemLength - headLength);
    for (int tailLength = 0; tailLength <= suffixLimit; tailLength++)
    {
        int tailStart = s.Length - tailLength;
        string head = s.Substring(0, headLength);
        string core = s.Substring(headLength, tailStart - headLength);
        string tail = s.Substring(tailStart);
        Console.WriteLine("{0} {1} {2}", head, core, tail);
    }
}