使用 PHP 根据字体映射表将 ASCII 文本转换为 Unicode

时间:2010-10-18 04:03:42

标签: php unicode ascii

我正在尝试把用非英文 ASCII 字体录入的旧文本转换为新的 Unicode 文本,关键在于字符之间的映射。我有几种方案可选。第一种:我有一个像下面这样的映射文件 sample.map(纯文本文件)。

映射文件的内容:
w=ം
x=ഃ
A=അ
B=ആ
C=ഇ
Cu=ഈ
D=ഉ
Du=ഊ
E=ഋ
\p=ഌ
F=എ
G=ഏ
sF=ഐ
H=ഒ
Hm=ഓ
Hu=ഔ
I=ക
J=ഖ

代码需要把所有左侧的字符替换为右侧的字符。怎样才能遍历每个字符,并根据映射文件中的信息进行替换?我一直在尝试查找替换的办法,但没有成功。怎样让 PHP 读取这个特定的映射文件(一个扩展名为 .map 的文本文件),遍历每个字符并完成替换,同时又不破坏文档?

完整的映射文件在这里

我还找到了一个可以完成此操作的 Python 脚本,但我没能把它移植到 PHP,因为我对 Python 很不熟悉。代码粘贴如下:

import sys 
import codecs 
import os 
from optparse import OptionParser 

class Payyan:

 def __init__(self):
  self.input_filename =""
  self.output_filename=""
  self.mapping_filename=""
  self.rulesDict=None
  self.pdf=0

 def word2ASCII(self, unicode_text):
  """Convert Unicode text to the legacy ASCII-font encoding.

  Sets ``self.direction`` to ``"u2a"``, obtains the rules table from
  ``LoadRules()`` and scans ``unicode_text`` from left to right. At
  every position the 3-, 2- and then 1-character slice is tried as a
  key of the rules table; the first one found is converted. A single
  character without a rule is copied to the output unchanged.

  Returns the converted string.
  """
  index = 0
  # prebase_letter is never read in this method.
  prebase_letter = ""
  ascii_text=""
  self.direction = "u2a"
  self.rulesDict = self.LoadRules()
  while index < len(unicode_text):
   '''This takes care of conjuncts '''
   # Longest slice first, so multi-character keys win over their prefixes.
   for charNo in [3,2,1]:
    letter = unicode_text[index:index+charNo]
    if letter in self.rulesDict:
     ascii_letter = self.rulesDict[letter]
     # From here on `letter` holds the UTF-8 encoded form and is only
     # compared with the literals below.
     # NOTE(review): the comparisons presumably rely on Python 2
     # byte-string literals in a UTF-8 source file - confirm.
     letter = letter.encode('utf-8')
     '''Fixing the prebase mathra'''
     '''TODO: Make it generic , so that usable for all indian languages'''
     # In the three special cases the mapped text is inserted in front
     # of the last character already written to ascii_text.
     if letter == 'ൈ':
      # The mapped letter is written twice before the previous character.
      ascii_text = ascii_text[:-1] + ascii_letter*2 + ascii_text[-1:]
     elif (letter == 'ോ') | (letter == 'ൊ') | (letter == 'ൌ'): #prebase+postbase mathra case
      # First mapped character goes before the previous character, the
      # second one after it.
      ascii_text = ascii_text[:-1] + ascii_letter[0] + ascii_text[-1:] + ascii_letter[1]
     # NOTE(review): the third literal below looks mis-encoded (UTF-8 read
     # as Windows-1252); presumably it was virama + RA - confirm against
     # the upstream script.
     elif (letter == 'െ') | (letter == 'േ') |(letter == 'àµà´°'): #only prebase
      ascii_text = ascii_text[:-1] + ascii_letter + ascii_text[-1:]
     else:
      ascii_text = ascii_text + ascii_letter      
     index = index+charNo
     break
    else:
     if(charNo==1):
      # No rule even for the single character: copy it through.
      index=index+1
      ascii_text = ascii_text + letter
      break;
     '''Did not get'''    
     # This value is never read: ascii_letter is reassigned before
     # every later use.
     ascii_letter = letter

  return ascii_text

 def Uni2Ascii(self):
  """Convert the Unicode input to ASCII-font text, line by line.

  Reads ``self.input_filename`` as UTF-8 (standard input when the name
  is empty), converts each line with ``word2ASCII`` and writes the
  result to ``self.output_filename``. When no output file is
  configured, each converted line is printed UTF-8 encoded instead.

  Returns:
   0 once the whole input has been processed.
  """
  if self.input_filename:
   uni_file = codecs.open(self.input_filename, encoding='utf-8', errors='ignore')
  else:
   # codecs.open() expects a file name; an already open stream has to be
   # wrapped in a decoding reader. On Python 3 the byte stream lives in
   # sys.stdin.buffer, on Python 2 sys.stdin itself delivers bytes.
   byte_stream = getattr(sys.stdin, 'buffer', sys.stdin)
   uni_file = codecs.getreader('utf-8')(byte_stream, errors='ignore')
  try:
   output_file = None
   if self.output_filename:
    output_file = codecs.open(self.output_filename, encoding='utf-8', errors='ignore', mode='w+')
   try:
    while 1:
     text = uni_file.readline()
     if text == "":
      break
     ascii_text = self.word2ASCII(text)
     if output_file is not None:
      output_file.write(ascii_text)
     else:
      print(ascii_text.encode('utf-8'))
   finally:
    # Closing flushes the buffered output to disk.
    if output_file is not None:
     output_file.close()
  finally:
   # Only close what this method opened; stdin stays open.
   if self.input_filename:
    uni_file.close()
  return 0

 def word2Unicode(self, ascii_text):
  """Convert ASCII-font text to Unicode.

  Sets ``self.direction`` to ``"a2u"``, obtains the rules table from
  ``LoadRules()`` and scans ``ascii_text`` from left to right, trying
  the 2- and then the 1-character slice at each position. A single
  character without a rule is copied to the output unchanged.

  Returns the converted string.
  """
  index = 0
  post_index = 0
  prebase_letter = ""
  postbase_letter = ""
  unicode_text = ""
  next_ucode_letter = ""
  self.direction="a2u"
  self.rulesDict = self.LoadRules()
  while index < len(ascii_text):
   # Longest slice first, so two-character keys win over their prefixes.
   for charNo in [2,1]:
    letter = ascii_text[index:index+charNo]
    if letter in self.rulesDict:
     unicode_letter = self.rulesDict[letter]
     if(self.isPrebase(unicode_letter)): 
      # Hold the prebase sign back; it is written together with the
      # next non-prebase letter.
      prebase_letter = unicode_letter
     else:
      # Look one character past the matched slice for a postbase sign.
      post_index = index+charNo
      if post_index < len(ascii_text):
       letter = ascii_text[post_index]
       if letter in self.rulesDict:
        next_ucode_letter = self.rulesDict[letter]
        if self.isPostbase(next_ucode_letter):
         postbase_letter = next_ucode_letter
         # Consume the postbase character as well.
         index = index + 1
      # NOTE(review): both literals below look mis-encoded (UTF-8 read as
      # Windows-1252); presumably they were two independent vowel
      # letters - confirm against the upstream script.
      if  ((unicode_letter.encode('utf-8') == "à´Ž") |
          ( unicode_letter.encode('utf-8') == "à´’" )):
       unicode_text = unicode_text + postbase_letter + self.getVowelSign(prebase_letter , unicode_letter)
      else:
       # Output order: letter, then postbase sign, then the held prebase sign.
       unicode_text = unicode_text + unicode_letter + postbase_letter + prebase_letter
      prebase_letter=""
      postbase_letter=""
     index = index + charNo
     break
    else:
     if charNo == 1:
      # No rule even for the single character: copy it through.
      unicode_text = unicode_text + letter
      index = index + 1
      break
     # This value is never read: unicode_letter is reassigned before
     # every later use.
     unicode_letter = letter
  return unicode_text 

 def Ascii2Uni(self):
  """Convert the ASCII-font input to Unicode, line by line.

  When ``self.pdf`` is set, ``pdftotext`` is run on
  ``self.input_filename`` first and the resulting ``.txt`` file becomes
  the input. The input is read as UTF-8 (standard input when the name
  is empty), each line is converted with ``word2Unicode`` and written
  to ``self.output_filename``. When no output file is configured, each
  converted line is printed UTF-8 encoded instead.

  Returns:
   0 on success, 1 when the PDF could not be converted with pdftotext.
  """
  if self.pdf:
   # Local import: only needed for PDF input.
   import subprocess
   try:
    # Argument list instead of a shell string, so quotes or spaces in
    # the file name cannot break (or inject into) the command.
    status = subprocess.call(["pdftotext", self.input_filename])
   except OSError:
    # pdftotext is not installed or cannot be executed.
    status = 1
   if status:
    print("The input file is a PDF file. To convert this the  pdftotext  utility is required. ")
    print("This feature is available only for GNU/Linux Operating system.")
    return 1 # Error - no pdftotext !
   self.input_filename = os.path.splitext(self.input_filename)[0] + ".txt"
  if self.input_filename:
   ascii_file = codecs.open(self.input_filename, encoding='utf-8', errors='ignore')
  else:
   # codecs.open() expects a file name; an already open stream has to be
   # wrapped in a decoding reader. On Python 3 the byte stream lives in
   # sys.stdin.buffer, on Python 2 sys.stdin itself delivers bytes.
   byte_stream = getattr(sys.stdin, 'buffer', sys.stdin)
   ascii_file = codecs.getreader('utf-8')(byte_stream, errors='ignore')
  try:
   output_file = None
   if self.output_filename:
    output_file = codecs.open(self.output_filename, encoding='utf-8', errors='ignore', mode='w+')
   try:
    while 1:
     text = ascii_file.readline()
     if text == "":
      break
     unicode_text = self.word2Unicode(text)
     if output_file is not None:
      output_file.write(unicode_text)
     else:
      print(unicode_text.encode('utf-8'))
   finally:
    # Closing flushes the buffered output to disk.
    if output_file is not None:
     output_file.close()
  finally:
   # Only close what this method opened; stdin stays open.
   if self.input_filename:
    ascii_file.close()
  return 0

 def getVowelSign(self, vowel_letter, vowel_sign_letter):
  """Merge a vowel letter and a vowel sign into one letter where a rule exists.

  Both arguments are UTF-8 encoded and compared with the literal pairs
  below. On a match the corresponding single literal is returned;
  otherwise the two arguments are returned concatenated, unchanged.

  NOTE(review): the literals starting with "à" look mis-encoded (UTF-8
  read as Windows-1252) and at least one appears to have lost a byte,
  so they presumably never match real text - confirm against the
  upstream script.
  NOTE(review): the only call visible in this class passes
  (prebase_letter, unicode_letter), i.e. the sign first and the vowel
  second - confirm the intended argument order.
  """
  vowel=  vowel_letter.encode('utf-8')
  vowel_sign=  vowel_sign_letter.encode('utf-8')
  if vowel == "à´Ž":
   if vowel_sign == "െ":
    return "à´"
  if vowel == "à´’":
   if vowel_sign == "à´¾":
    return "à´“"
   if vowel_sign =="ൗ":
    return "à´”"
  # No special combination: plain concatenation of the two inputs.
  return (vowel_letter+ vowel_sign_letter)

 def isPrebase(self, letter):
  """Tell whether ``letter`` is one of the signs treated as prebase.

  The recognised values are the Malayalam vowel signs E, EE, AI, O, OO
  and AU plus the two-character sequence virama + RA. They are written
  as code-point escapes so the check does not depend on the encoding of
  this source file (one of the original literals had been mis-decoded
  as Windows-1252).

  Args:
   letter: Unicode string to test.

  Returns:
   True if ``letter`` equals one of the recognised signs, else False.
  """
  return letter in (
   u"\u0d47",        # vowel sign EE
   u"\u0d48",        # vowel sign AI
   u"\u0d4a",        # vowel sign O
   u"\u0d4b",        # vowel sign OO
   u"\u0d4c",        # vowel sign AU
   u"\u0d4d\u0d30",  # virama + RA
   u"\u0d46",        # vowel sign E
  )

 def isPostbase(self, letter):
  """Tell whether ``letter`` is one of the signs treated as postbase.

  The recognised values are the two-character sequences virama + YA and
  virama + VA. They are written as code-point escapes because the
  original literals had been mis-decoded as Windows-1252 and could not
  match real text.

  Args:
   letter: Unicode string to test.

  Returns:
   True if ``letter`` equals one of the two sequences, else False.
  """
  return letter in (
   u"\u0d4d\u0d2f",  # virama + YA
   u"\u0d4d\u0d35",  # virama + VA
  )

 def LoadRules(self): 
  if(self.rulesDict):
   return self.rulesDict
  rules_dict = dict()
  line = []
  line_number = 0
  rules_file = codecs. open(self.mapping_filename,encoding='utf-8', errors='ignore')
  while 1:
   ''' Keep the line number. Required for error reporting'''
   line_number = line_number +1 
      text = unicode( rules_file.readline())
   if text == "":
         break
   '''Ignore the comments'''
   if text[0] == '#': 
         continue 
   line = text.strip()
   if(line == ""):
      continue 
   if(len(line.split("=")) != 2):
     print "Error: Syntax Error in the Ascii to Unicode Map in line number ",  line_number
       print "Line: "+ text
       return 2 # Error - Syntax error in Mapping file 
    lhs = line.split("=") [ 0 ]  
    rhs = line.split("=") [ 1 ]  
   if self.direction == 'a2u':
    rules_dict[lhs]=rhs
   else:
    rules_dict[rhs]=lhs
  return rules_dict

更新:我想我之前关于“ASCII”的说法并不准确。这些文本其实是用非英文字体书写的文字,我想把它转换为 Unicode,以便能够正确显示。

2 个答案:

答案 0 :(得分:0)

ASCII 实际上只有 7 位。我猜您处理的可能是 ISO-8859-1 编码的文本,需要把它转换为 UTF-8 或其他 Unicode 编码。可以使用 iconv:

http://php.net/manual/en/function.iconv.php

答案 1 :(得分:0)

请记住,PHP 的 str_replace 会按数组中的顺序从左到右依次替换。因此在这里只需把原始值数组按降序排列,使较长的键(例如 'ss')排在它的前缀(例如 's')之前先被匹配;否则 'ss' 会被替换成两个 's' 各自对应的字符(原文示例:'àμƒàμƒ'),而不是 'ss' 本应对应的字符(原文示例:'àμ')。

$original = array("€", "Å“", "Å’", "ž", "Ž", "Ÿ", "Å¡", "Å ", "Ù", "À", "Û", "Ë", "É", "Ã…", "Õ", "Ç", "Æ", "Ä", "Ô", "Ó", "Â", "Ã’", "Ñ", "×", "Ö", "Ø", "È", "Ã", "Þ", "ÃŽ", "ß", "Ú", "Ê", "Ãœ", "ÃŒ", "þ", "õ", "ô", "ó", "ò", "ñ", "ð", "ï", "î", "Ã¥", "ä", "ã", "â", "á", "à ", "Ã", "Ã", "Ã", "Ã", "Ã", "¿", "¾", "½", "¼", "»", "º", "¹", "¸", "·", "¶", "µ", "´", "³", "²", "±", "°", "¯", "®", "¬", "«", "ª", "©", "¨", "§", "¦", "Â¥", "¤", "£", "¢", "¡", "­", "}", "|", "{", "z", "y", "x", "w", "v", "u", "tm", "t", "su", "ss", "sm", "sF", "s", "r", "q", "p", "o", "n", "m", "l", "k", "j", "i", "h", "g", "f", "e", "d", "c", "b", "a", "`", "_", "^", "]", "\p", "\", "[", "Z", "Y", "X", "W", "V", "U", "T", "S", "R", "Q", "P", "O", "N", "M", "L", "K", "J", "I", "Hu", "Hm", "H", "G", "F", "E", "Du", "D", "Cu", "C", "B", "A", "$");
$replaced = array("à´—àµà´—", "à´®àµà´®", "à´®àµà´ª", "à´ªàµà´ª", "à´¨àµà´¤", "à´®àµà´²", "à´šàµà´š", "à´™àµà´•", "à´¸àµà´¥", "à´¨àµà´¦", "à´¤àµà´­", "à´¹àµà´²", "à´³àµà´³", "à´®àµà´®", "à´žàµà´š", "à´¯àµà´¯", "à´µàµà´µ", "à´®àµà´ª", "à´¨àµà´§", "à´¹àµà´¨", "à´¨àµà´±", "à´¹àµà´®", "à´šàµà´›", "à´£àµà´®", "à´œàµà´œ", "à´¸àµà´¥", "à´²àµà´²", "à´ªàµà´ª", "à´£àµà´¡", "à´•àµà´Ÿ", "à´¤àµà´®", "à´œàµà´ž", "à´±àµà´±", "à´—àµà´®", "à´±àµà´±", "-", "à´­", "à´¸àµà´¸", "à´¨àµà´¨", "à´¨àµà´®", "à´²àµà´²", "à´²àµâ€", "à´¨àµà´®", "à´¨àµ", "à´±àµà´±", "à´·àµà´Ÿ", "à´¨àµà´±", "à´—àµà´¨", "à´£àµà´Ÿ", "à´•àµà´¤", "à´¶àµà´š", "à´¨àµà´¤", "à´¬àµà´§", "à´¡àµà´¡", "à´¨àµà´¨", "à´¤àµà´¤", "à´£àµà´Ÿ", "à´Ÿàµà´Ÿ", "à´žàµà´ž", "à´šàµà´š", "à´™àµà´™", "à´™àµà´•", "à´•àµà´·", "à´¨àµà´®", "à´¨àµà´¨", "à´¨àµà´¦", "à´¨àµà´¤", "à´³àµâ€", "à´²àµâ€", "à´°àµâ€", "à´¨àµâ€", "à´¤àµà´¤", "à´£àµà´£", "à´£àµà´Ÿ", "à´£àµâ€", "à´Ÿàµà´Ÿ", "à´¦àµà´§", "à´žàµà´š", "à´“", "à´™àµà´™", "à´™àµà´•", "à´¦àµà´¦", "à´ˆ", "à´•àµà´·", "à´•àµà´²", "à´•àµà´•", "àµà´°", "àµà´µ", "àµà´°", "àµà´µ", "àµà´¯", "à´ƒ", "à´‚", "ൌ", "േ", "ോ", "െ", "ൌ", "ൈ", "ൊ", "à´", "ൃ", "ൂ", "àµ", "ീ", "à´¿", "à´¾", "àµ", "à´±", "à´´", "à´³", "à´¹", "à´¸", "à´·", "à´¶", "à´µ", "à´²", "à´°", "à´¯", "à´®", "à´­", "à´¬", "à´«", "à´ª", "à´Œ", "à´¨", "à´§", "à´¦", "à´¥", "à´¤", "à´£", "à´¢", "à´¡", "à´ ", "à´Ÿ", "à´ž", "à´", "à´œ", "à´›", "à´š", "à´™", "à´˜", "à´—", "à´–", "à´•", "à´”", "à´“", "à´’", "à´", "à´Ž", "à´‹", "à´Š", "à´‰", "à´ˆ", "à´‡", "à´†", "à´…", "à´¸àµà´±àµà´±");

// str_replace() works through the search array in order, one entry after
// another, so a longer key such as "ss" must be listed before its prefix "s".
// NOTE(review): the "\" entry in $original has to be written "\\"; as pasted,
// the backslash escapes the closing quote and the array literal does not parse.
$new_string = str_replace($original, $replaced, $old_string);

注意:文件的字符集应当使用 windows-1252,而不是 UTF-8,因为以数组作参数的 str_replace 在处理多字节字符时结果不可预测。