我正在尝试把用 ASCII 非英文字体(旧式字体编码)书写的旧文本转换为新的 Unicode 文本,所以关键在于字符映射(map)。我有两种办法可选。首先,我有一个像这样的映射文件 sample.map(纯文本文件):
w=à´‚
x=à´ƒ
A=à´…
B=à´†
C=à´‡
Cu=à´ˆ
D=à´‰
Du=à´Š
E=à´‹
\p=ഌ
F=à´Ž
G=à´
sF=à´
H=à´’
Hm=à´“
Hu=à´”
I=à´•
J=à´–
代码必须把所有左侧的字符(串)替换为对应的右侧字符。如何逐个字符遍历文本,并根据映射文件中的信息进行替换?我一直在尝试"查找并替换"的做法,但都失败了。如何让 PHP 读取这个映射文件(一个扩展名为 .map 的文本文件),遍历每个字符并完成替换,同时不破坏文档?
我还找到了一个完成同样工作的 Python 脚本,但没能把它移植到 PHP,因为我对 Python 很不熟悉。代码粘贴如下:
import sys
import codecs
import os
from optparse import OptionParser
class Payyan:
    """Convert text between a legacy ASCII-font encoding and Unicode.

    The conversion is driven by a UTF-8 mapping file with one
    ``legacy=unicode`` rule per line; lines starting with ``#`` and blank
    lines are ignored.  The script-specific reordering rules below are
    written for Malayalam (code points U+0D00-U+0D7F).

    The code runs unchanged on Python 2 and Python 3: all script constants
    are spelled as ``\\u`` escapes and compared as text, so the result does
    not depend on the encoding the source file happens to be saved in.
    """

    # Dependent vowel signs.
    _SIGN_AA = u"\u0d3e"
    _SIGN_E = u"\u0d46"
    _SIGN_EE = u"\u0d47"
    _SIGN_AI = u"\u0d48"
    _SIGN_O = u"\u0d4a"
    _SIGN_OO = u"\u0d4b"
    _SIGN_AU = u"\u0d4c"
    _AU_LENGTH_MARK = u"\u0d57"
    # Virama followed by RA / YA / VA (subjoined consonant forms).
    _SUBJOINED_RA = u"\u0d4d\u0d30"
    _SUBJOINED_YA = u"\u0d4d\u0d2f"
    _SUBJOINED_VA = u"\u0d4d\u0d35"
    # Independent vowels.
    _LETTER_E = u"\u0d0e"
    _LETTER_AI = u"\u0d10"
    _LETTER_O = u"\u0d12"
    _LETTER_OO = u"\u0d13"
    _LETTER_AU = u"\u0d14"

    # Signs whose legacy form has one glyph before and one after the base.
    _TWO_PART_SIGNS = (_SIGN_OO, _SIGN_O, _SIGN_AU)
    # Signs whose legacy form is written entirely before the base.
    _PREBASE_ONLY = (_SIGN_E, _SIGN_EE, _SUBJOINED_RA)
    _PREBASE = frozenset((_SIGN_EE, _SIGN_AI, _SIGN_O, _SIGN_OO, _SIGN_AU,
                          _SUBJOINED_RA, _SIGN_E))
    _POSTBASE = frozenset((_SUBJOINED_YA, _SUBJOINED_VA))

    def __init__(self):
        self.input_filename = ""
        self.output_filename = ""
        self.mapping_filename = ""
        self.rulesDict = None
        self.pdf = 0
        # Conversion direction: "a2u" (legacy -> Unicode) or "u2a".
        self.direction = "a2u"
        # Direction the cached rulesDict was built for (None = not loaded
        # by LoadRules, e.g. supplied by the caller).
        self._rules_direction = None

    def word2ASCII(self, unicode_text):
        """Return *unicode_text* converted to the legacy ASCII-font form.

        Sets ``self.direction`` to ``"u2a"`` and (re)loads the rules.
        Sequences of up to three characters are looked up, longest first;
        characters without a rule are copied through unchanged.
        """
        self.direction = "u2a"
        self.rulesDict = self.LoadRules()
        rules = self.rulesDict
        ascii_text = ""
        index = 0
        while index < len(unicode_text):
            # Longest match first: this takes care of conjuncts.
            for charNo in (3, 2, 1):
                letter = unicode_text[index:index + charNo]
                if letter in rules:
                    ascii_letter = rules[letter]
                    # Pre-base signs are typed before the base glyph in the
                    # legacy font, so they are inserted before the last
                    # glyph already emitted.
                    # TODO: make it generic, usable for all Indian languages.
                    if letter == self._SIGN_AI:
                        ascii_text = (ascii_text[:-1] + ascii_letter * 2
                                      + ascii_text[-1:])
                    elif letter in self._TWO_PART_SIGNS:
                        # First glyph goes before the base, second after it.
                        ascii_text = (ascii_text[:-1] + ascii_letter[0]
                                      + ascii_text[-1:] + ascii_letter[1])
                    elif letter in self._PREBASE_ONLY:
                        ascii_text = (ascii_text[:-1] + ascii_letter
                                      + ascii_text[-1:])
                    else:
                        ascii_text = ascii_text + ascii_letter
                    index += charNo
                    break
                if charNo == 1:
                    # No rule at any length: keep the character as it is.
                    ascii_text = ascii_text + letter
                    index += 1
        return ascii_text

    def Uni2Ascii(self):
        """Convert the input (file or stdin) from Unicode to legacy text.

        Output goes to ``self.output_filename`` when set, otherwise to
        stdout as UTF-8.  Returns 0.
        """
        return self._convert_lines(self.word2ASCII)

    def word2Unicode(self, ascii_text):
        """Return *ascii_text* (legacy ASCII-font text) converted to Unicode.

        Sets ``self.direction`` to ``"a2u"`` and (re)loads the rules.
        Sequences of up to two characters are looked up, longest first.  A
        pre-base sign is held back and emitted after the next base letter;
        a single-character post-base form directly following the base is
        pulled in between the two.  Characters without a rule are copied
        through unchanged.
        """
        self.direction = "a2u"
        self.rulesDict = self.LoadRules()
        rules = self.rulesDict
        index = 0
        prebase_letter = ""
        postbase_letter = ""
        unicode_text = ""
        while index < len(ascii_text):
            for charNo in (2, 1):
                letter = ascii_text[index:index + charNo]
                if letter in rules:
                    unicode_letter = rules[letter]
                    if self.isPrebase(unicode_letter):
                        # Remember it until the base letter shows up.
                        prebase_letter = unicode_letter
                    else:
                        post_index = index + charNo
                        if post_index < len(ascii_text):
                            following = rules.get(ascii_text[post_index])
                            if (following is not None
                                    and self.isPostbase(following)):
                                postbase_letter = following
                                # Consume the post-base glyph as well.
                                index += 1
                        if unicode_letter in (self._LETTER_E, self._LETTER_O):
                            # Independent vowel plus a held-back sign may
                            # form a single vowel letter.  The vowel is the
                            # first argument; the original call passed the
                            # two swapped, so the composition never fired.
                            unicode_text = (
                                unicode_text + postbase_letter
                                + self.getVowelSign(unicode_letter,
                                                    prebase_letter))
                        else:
                            unicode_text = (unicode_text + unicode_letter
                                            + postbase_letter + prebase_letter)
                        prebase_letter = ""
                        postbase_letter = ""
                    index += charNo
                    break
                if charNo == 1:
                    # No rule at any length: keep the character as it is.
                    unicode_text = unicode_text + letter
                    index += 1
        return unicode_text

    def Ascii2Uni(self):
        """Convert the input (file or stdin) from legacy text to Unicode.

        When ``self.pdf`` is set, ``pdftotext`` is run on the input file
        first and the resulting ``.txt`` file becomes the input.  Returns 1
        if ``pdftotext`` is missing or fails, otherwise 0.
        """
        if self.pdf:
            import subprocess
            try:
                # Argument list, no shell: file names containing quotes or
                # shell metacharacters are passed through safely.
                status = subprocess.call(["pdftotext", self.input_filename])
            except OSError:
                status = 1  # pdftotext could not be started
            if status:
                print("The input file is a PDF file. To convert this the pdftotext utility is required. ")
                print("This feature is available only for GNU/Linux Operating system.")
                return 1  # Error - no pdftotext !
            self.input_filename = (
                os.path.splitext(self.input_filename)[0] + ".txt")
        return self._convert_lines(self.word2Unicode)

    def getVowelSign(self, vowel_letter, vowel_sign_letter):
        """Combine an independent vowel with a vowel sign.

        Returns the single vowel letter for the known combinations
        (E + E-sign -> AI, O + AA-sign -> OO, O + AU length mark -> AU);
        otherwise returns ``vowel_letter + vowel_sign_letter``.
        """
        if vowel_letter == self._LETTER_E:
            if vowel_sign_letter == self._SIGN_E:
                return self._LETTER_AI
        if vowel_letter == self._LETTER_O:
            if vowel_sign_letter == self._SIGN_AA:
                return self._LETTER_OO
            if vowel_sign_letter == self._AU_LENGTH_MARK:
                return self._LETTER_AU
        return vowel_letter + vowel_sign_letter

    def isPrebase(self, letter):
        """Return True if *letter* is a sign typed before its base glyph."""
        return letter in self._PREBASE

    def isPostbase(self, letter):
        """Return True if *letter* is a subjoined YA or VA form."""
        return letter in self._POSTBASE

    def LoadRules(self):
        """Load the mapping file and return it as a dict.

        For direction ``"a2u"`` the dict maps left-hand side to right-hand
        side; for any other direction it maps right to left.  A table that
        is already cached in ``self.rulesDict`` is reused, unless it was
        loaded here for a different direction.

        Raises:
            ValueError: a non-comment, non-blank line does not contain
                exactly one ``=``.
        """
        if self.rulesDict:
            loaded_for = getattr(self, "_rules_direction", None)
            # None means the table was supplied by the caller: trust it.
            if loaded_for is None or loaded_for == self.direction:
                return self.rulesDict
        rules_dict = {}
        rules_file = codecs.open(self.mapping_filename, encoding='utf-8',
                                 errors='ignore')
        try:
            # Line numbers are kept for error reporting.
            for line_number, text in enumerate(rules_file, 1):
                if text[0] == '#':
                    continue  # comment line
                line = text.strip()
                if line == "":
                    continue
                parts = line.split("=")
                if len(parts) != 2:
                    raise ValueError(
                        "Error: Syntax Error in the Ascii to Unicode Map "
                        "in line number %d: %s" % (line_number, line))
                lhs, rhs = parts
                if self.direction == 'a2u':
                    rules_dict[lhs] = rhs
                else:
                    rules_dict[rhs] = lhs
        finally:
            rules_file.close()
        self._rules_direction = self.direction
        return rules_dict

    def _open_input(self):
        """Return a UTF-8 decoding reader for the input file or stdin."""
        if self.input_filename:
            return codecs.open(self.input_filename, encoding='utf-8',
                               errors='ignore')
        # codecs.open() needs a file name, so stdin is wrapped instead.
        # Python 3 exposes the byte stream as sys.stdin.buffer.
        raw_stdin = getattr(sys.stdin, 'buffer', sys.stdin)
        return codecs.getreader('utf-8')(raw_stdin, errors='ignore')

    def _print_utf8(self, text):
        """Write *text* plus a newline to stdout as UTF-8 bytes."""
        stream = getattr(sys.stdout, 'buffer', sys.stdout)
        stream.write(text.encode('utf-8') + b"\n")

    def _convert_lines(self, convert_line):
        """Run *convert_line* over every input line and emit the results."""
        owns_input = bool(self.input_filename)
        source = self._open_input()
        sink = None
        try:
            if self.output_filename:
                sink = codecs.open(self.output_filename, encoding='utf-8',
                                   errors='ignore', mode='w+')
            for text in source:
                converted = convert_line(text)
                if sink is not None:
                    sink.write(converted)
                else:
                    self._print_utf8(converted)
        finally:
            if sink is not None:
                sink.close()
            if owns_input:
                source.close()  # never close stdin
        return 0
更新:我想我之前关于 ASCII 的说法并不准确。这些文字是用非英文字体书写的,我想把它们转换为 Unicode,以便能够正确显示。
答案 0 :(得分:0)
ASCII 实际上只有 7 位。我猜您处理的可能是 ISO-8859-1 编码的文本,并希望把它转换为 UTF-8 或其他 Unicode 编码。可以使用 iconv:
答案 1 :(得分:0)
请记住,PHP 的 str_replace 会按数组顺序从左到右依次替换。因此,这里只需要把原始值数组按降序排列,让 'ss' 排在 's' 之前先被匹配(否则 'ss' 会被替换成 'àμƒàμƒ',而不是 'àμ')。
$original = array("€", "Å“", "Å’", "ž", "Ž", "Ÿ", "Å¡", "Å ", "Ù", "À", "Û", "Ë", "É", "Ã…", "Õ", "Ç", "Æ", "Ä", "Ô", "Ó", "Â", "Ã’", "Ñ", "×", "Ö", "Ø", "È", "Ã", "Þ", "ÃŽ", "ß", "Ú", "Ê", "Ãœ", "ÃŒ", "þ", "õ", "ô", "ó", "ò", "ñ", "ð", "ï", "î", "Ã¥", "ä", "ã", "â", "á", "à ", "Ã", "Ã", "Ã", "Ã", "Ã", "¿", "¾", "½", "¼", "»", "º", "¹", "¸", "·", "¶", "µ", "´", "³", "²", "±", "°", "¯", "®", "¬", "«", "ª", "©", "¨", "§", "¦", "Â¥", "¤", "£", "¢", "¡", "Â", "}", "|", "{", "z", "y", "x", "w", "v", "u", "tm", "t", "su", "ss", "sm", "sF", "s", "r", "q", "p", "o", "n", "m", "l", "k", "j", "i", "h", "g", "f", "e", "d", "c", "b", "a", "`", "_", "^", "]", "\p", "\", "[", "Z", "Y", "X", "W", "V", "U", "T", "S", "R", "Q", "P", "O", "N", "M", "L", "K", "J", "I", "Hu", "Hm", "H", "G", "F", "E", "Du", "D", "Cu", "C", "B", "A", "$");
$replaced = array("à´—àµà´—", "à´®àµà´®", "à´®àµà´ª", "à´ªàµà´ª", "à´¨àµà´¤", "à´®àµà´²", "à´šàµà´š", "à´™àµà´•", "à´¸àµà´¥", "à´¨àµà´¦", "à´¤àµà´", "à´¹àµà´²", "à´³àµà´³", "à´®àµà´®", "à´žàµà´š", "à´¯àµà´¯", "à´µàµà´µ", "à´®àµà´ª", "à´¨àµà´§", "à´¹àµà´¨", "à´¨àµà´±", "à´¹àµà´®", "à´šàµà´›", "à´£àµà´®", "à´œàµà´œ", "à´¸àµà´¥", "à´²àµà´²", "à´ªàµà´ª", "à´£àµà´¡", "à´•àµà´Ÿ", "à´¤àµà´®", "à´œàµà´ž", "à´±àµà´±", "à´—àµà´®", "à´±àµà´±", "-", "à´", "à´¸àµà´¸", "à´¨àµà´¨", "à´¨àµà´®", "à´²àµà´²", "à´²àµâ€", "à´¨àµà´®", "à´¨àµ", "à´±àµà´±", "à´·àµà´Ÿ", "à´¨àµà´±", "à´—àµà´¨", "à´£àµà´Ÿ", "à´•àµà´¤", "à´¶àµà´š", "à´¨àµà´¤", "à´¬àµà´§", "à´¡àµà´¡", "à´¨àµà´¨", "à´¤àµà´¤", "à´£àµà´Ÿ", "à´Ÿàµà´Ÿ", "à´žàµà´ž", "à´šàµà´š", "à´™àµà´™", "à´™àµà´•", "à´•àµà´·", "à´¨àµà´®", "à´¨àµà´¨", "à´¨àµà´¦", "à´¨àµà´¤", "à´³àµâ€", "à´²àµâ€", "à´°àµâ€", "à´¨àµâ€", "à´¤àµà´¤", "à´£àµà´£", "à´£àµà´Ÿ", "à´£àµâ€", "à´Ÿàµà´Ÿ", "à´¦àµà´§", "à´žàµà´š", "à´“", "à´™àµà´™", "à´™àµà´•", "à´¦àµà´¦", "à´ˆ", "à´•àµà´·", "à´•àµà´²", "à´•àµà´•", "àµà´°", "àµà´µ", "àµà´°", "àµà´µ", "àµà´¯", "à´ƒ", "à´‚", "ൌ", "േ", "ോ", "െ", "ൌ", "ൈ", "ൊ", "à´", "ൃ", "ൂ", "àµ", "ീ", "à´¿", "à´¾", "àµ", "à´±", "à´´", "à´³", "à´¹", "à´¸", "à´·", "à´¶", "à´µ", "à´²", "à´°", "à´¯", "à´®", "à´", "à´¬", "à´«", "à´ª", "à´Œ", "à´¨", "à´§", "à´¦", "à´¥", "à´¤", "à´£", "à´¢", "à´¡", "à´ ", "à´Ÿ", "à´ž", "à´", "à´œ", "à´›", "à´š", "à´™", "à´˜", "à´—", "à´–", "à´•", "à´”", "à´“", "à´’", "à´", "à´Ž", "à´‹", "à´Š", "à´‰", "à´ˆ", "à´‡", "à´†", "à´…", "à´¸àµà´±àµà´±");
// Replace each legacy-font sequence from $original with the entry at the same
// index in $replaced. str_replace() applies the pairs in array order, so a
// longer sequence must be listed before any shorter sequence it starts with.
$new_string = str_replace($original, $replaced, $old_string);
注意:字符集应使用 windows-1252,而不是 UTF-8,因为以数组方式调用 str_replace 时,多字节字符的替换结果是不可预测的。