将重音转换为UTF-8字符

时间:2016-06-14 07:37:55

标签: r encoding character-encoding diacritics non-ascii-characters

我有一些包含各种变音符号的 Rnw/LaTeX 文档,例如 Kri\'{s}hna、St\"{o}ne 等。

当我编译(knit)文档时,它们会被正确地转换为 Kriśhna、Stöne 等。

R中是否有一个命令或任何R包可以为我做这个转换?

例如:

conv("Kri\'{s}hna", from, to)

应该返回 Kriśhna

我花了很多时间尝试使用 iconv 函数,但没有成功。

3 个答案:

答案 0 :(得分:0)

您还可以使用包stringr

library(stringr)

# In R source code a LaTeX backslash has to be written as "\\". The sequence
# "\'" is only an escaped apostrophe, so a pattern written that way never
# matches the backslash that is present in text read from an Rnw/LaTeX file.
# fixed() makes stringr treat the pattern literally instead of as a regex.
str_replace_all("Kri\\'{s}hna", fixed("\\'{s}"), "ś")
#> [1] "Kriśhna"

str_replace_all("St\\\"{o}ne", fixed("\\\"{o}"), "ö")
#> [1] "Stöne"

答案 1 :(得分:0)

修改

不知道它是否可以帮助你,但我找到了一个可以将带有特殊字符的文本转换为LaTeX文本的软件包。
它来自dplR包,函数为latexify

# dplR::latexify() goes the other way: Unicode text in, LaTeX markup out.
# The input must contain both accented letters for the printed result to
# include both \"{o} and \^{e}.
latexify("STönê", doublebackslash = FALSE)
#> [1] "ST\\\"{o}n\\^{e}"

因为不知道是否存在任何包,我建议你创建自己的函数。

#' Template for a hand-written LaTeX-to-Unicode converter.
#'
#' @param text Character vector to convert.
#' @return `text` with every match of each pattern replaced.
convertForMe <- function(text) {
  textToConvert <- text
  # gsub(pattern, replacement, x) performs the substitution; grep() only
  # searches and has no replacement argument.
  textToConvert <- gsub("any_pattern", "any_replacement", textToConvert)
  # ... add one gsub() call per additional pattern here ...
  textToConvert
}

答案 2 :(得分:0)

以下代码按照 Bolker 的建议,实现了 latexify 的逆向转换。并非所有字符都能被处理,但对我的用途来说已经足够。

# Accent table taken from dplR:::latexify.
# Each entry is c(<Unicode combining mark>, <LaTeX accent command>). The marks
# are written as \u escapes because bare combining characters are invisible
# and easily mangled in source text.
substitutions <- list(
  diaeresis   = c("\u0308", "\""),
  acute       = c("\u0301", "'"),
  dotabove    = c("\u0307", "."),
  macron      = c("\u0304", "="),
  circumflex  = c("\u0302", "^"),
  grave       = c("\u0300", "`"),
  tilde       = c("\u0303", "~"),
  doubleacute = c("\u030b", "H"),
  ringabove   = c("\u030a", "r"),
  breve       = c("\u0306", "u"),
  caron       = c("\u030c", "v"),
  invbreve    = c("\u0311", "newtie"),
  macronbelow = c("\u0331", "b"),
  cedilla     = c("\u0327", "c"),
  dotbelow    = c("\u0323", "d"),
  tie         = c("\u0361", "t"),
  ogonek      = c("\u0328", "k")
)

# Rewrite every entry as c(<gsub replacement>, <regex pattern>):
#   replacement = captured base letter followed by the combining mark
#   pattern     = backslash, accent command, one letter inside braces
substitutions <- lapply(substitutions, function(entry) {
  # Escape regex metacharacters in the accent command. Without this the "."
  # of \.{o} matches any accent and the "^" of \^{o} is read as an anchor.
  accent <- gsub("([.^$*+?()|{}]|\\[|\\]|\\\\)", "\\\\\\1", entry[2])
  # Accept any ASCII letter (e.g. \'{s}, \v{c}, \"{O}), spelled out explicitly
  # so the class does not depend on locale-specific range ordering.
  letter_class <- paste0("[", paste(c(LETTERS, letters), collapse = ""), "]")
  c(
    paste0("\\1", entry[1]),
    paste0("\\\\", accent, "\\{(", letter_class, ")\\}")
  )
})

# Punctuation, marks and special letters.
# Every entry is c(<replacement character>, <regex matching the LaTeX command
# followed by empty braces>), the same layout detexify() passes to gsub().
substitutions <- append(
  substitutions,
  list(
    # punctuation and typographic marks
    c("¡", "\\\\textexclamdown\\{\\}"),
    c("£", "\\\\pounds\\{\\}"),
    c("§", "\\\\S\\{\\}"),
    c("©", "\\\\copyright\\{\\}"),
    c("ª", "\\\\textordfeminine\\{\\}"),
    c("®", "\\\\textregistered\\{\\}"),
    c("¶", "\\\\P\\{\\}"),
    c("·", "\\\\textperiodcentered\\{\\}"),
    c("º", "\\\\textordmasculine\\{\\}"),
    c("¿", "\\\\textquestiondown\\{\\}"),
    c("–", "\\\\textendash\\{\\}"),
    c("—", "\\\\textemdash\\{\\}"),
    c("‘", "\\\\textquoteleft\\{\\}"),
    c("’", "\\\\textquoteright\\{\\}"),
    c("“", "\\\\textquotedblleft\\{\\}"),
    c("”", "\\\\textquotedblright\\{\\}"),
    c("†", "\\\\dag\\{\\}"),
    c("‡", "\\\\ddag\\{\\}"),
    c("•", "\\\\textbullet\\{\\}"),
    c("…", "\\\\dots\\{\\}"),
    c("™", "\\\\texttrademark\\{\\}"),
    c("␣", "\\\\textvisiblespace\\{\\}"),
    # ligatures and special letters
    c("Æ", "\\\\AE\\{\\}"),
    c("æ", "\\\\ae\\{\\}"),
    c("Œ", "\\\\OE\\{\\}"),
    c("œ", "\\\\oe\\{\\}"),
    c("Ø", "\\\\O\\{\\}"),
    c("ø", "\\\\o\\{\\}"),
    c("Ł", "\\\\L\\{\\}"),
    c("ł", "\\\\l\\{\\}"),
    c("ß", "\\\\ss\\{\\}")
  )
)

# Additional letters and quotation marks.
# Every entry is c(<replacement character>, <regex matching the LaTeX command
# followed by empty braces>).
substitutions <- append(
  substitutions,
  list(
    # extra letters
    c("Ð", "\\\\DH\\{\\}"),
    c("ð", "\\\\dh\\{\\}"),
    c("Đ", "\\\\DJ\\{\\}"),
    c("đ", "\\\\dj\\{\\}"),
    c("Ŋ", "\\\\NG\\{\\}"),
    c("ŋ", "\\\\ng\\{\\}"),
    c("Þ", "\\\\TH\\{\\}"),
    c("þ", "\\\\th\\{\\}"),
    # guillemets and low quotation marks
    c("«", "\\\\guillemotleft\\{\\}"),
    c("»", "\\\\guillemotright\\{\\}"),
    c("‚", "\\\\quotesinglbase\\{\\}"),
    c("„", "\\\\quotedblbase\\{\\}"),
    c("‹", "\\\\guilsinglleft\\{\\}"),
    c("›", "\\\\guilsinglright\\{\\}")
  )
)
# Symbol commands: miscellaneous signs, arrows, brackets, units, currencies,
# spacing accents and arithmetic symbols.
# Every entry is c(<replacement character>, <regex matching the LaTeX command
# followed by empty braces>).
substitutions <- append(
  substitutions,
  list(
    # miscellaneous signs
    c("∗", "\\\\textasteriskcentered\\{\\}"),
    c("‖", "\\\\textbardbl\\{\\}"),
    c("◯", "\\\\textbigcircle\\{\\}"),
    c("␢", "\\\\textblank\\{\\}"),
    c("¦", "\\\\textbrokenbar\\{\\}"),
    c("⁒", "\\\\textdiscount\\{\\}"),
    c("℮", "\\\\textestimated\\{\\}"),
    c("‽", "\\\\textinterrobang\\{\\}"),
    c("\u2e18", "\\\\textinterrobangdown\\{\\}"),
    c("№", "\\\\textnumero\\{\\}"),
    c("◦", "\\\\textopenbullet\\{\\}"),
    c("‰", "\\\\textperthousand\\{\\}"),
    c("‱", "\\\\textpertenthousand\\{\\}"),
    c("℞", "\\\\textrecipe\\{\\}"),
    c("※", "\\\\textreferencemark\\{\\}"),
    c("\u02f7", "\\\\texttildelow\\{\\}"),
    # arrows
    c("←", "\\\\textleftarrow\\{\\}"),
    c("↑", "\\\\textuparrow\\{\\}"),
    c("→", "\\\\textrightarrow\\{\\}"),
    c("↓", "\\\\textdownarrow\\{\\}"),
    # brackets
    c("〈", "\\\\textlangle\\{\\}"),
    c("〉", "\\\\textrangle\\{\\}"),
    c("〚", "\\\\textlbrackdbl\\{\\}"),
    c("〛", "\\\\textrbrackdbl\\{\\}"),
    c("⁅", "\\\\textlquill\\{\\}"),
    c("⁆", "\\\\textrquill\\{\\}"),
    # marks and units
    c("℗", "\\\\textcircledP\\{\\}"),
    c("℠", "\\\\textservicemark\\{\\}"),
    c("℃", "\\\\textcelsius\\{\\}"),
    c("℧", "\\\\textmho\\{\\}"),
    c("µ", "\\\\textmu\\{\\}"),
    c("Ω", "\\\\textohm\\{\\}"),
    # currencies
    c("฿", "\\\\textbaht\\{\\}"),
    c("¢", "\\\\textcent\\{\\}"),
    c("₡", "\\\\textcolonmonetary\\{\\}"),
    c("¤", "\\\\textcurrency\\{\\}"),
    c("₫", "\\\\textdong\\{\\}"),
    c("\u20b2", "\\\\textguarani\\{\\}"),
    c("₤", "\\\\textlira\\{\\}"),
    c("₦", "\\\\textnaira\\{\\}"),
    c("₱", "\\\\textpeso\\{\\}"),
    c("₩", "\\\\textwon\\{\\}"),
    c("¥", "\\\\textyen\\{\\}"),
    # spacing accents
    c("˝", "\\\\textacutedbl\\{\\}"),
    c("´", "\\\\textasciiacute\\{\\}"),
    c("¸", "\\\\c\\{\\}"),
    c("˘", "\\\\textasciibreve\\{\\}"),
    c("ˇ", "\\\\textasciicaron\\{\\}"),
    c("¨", "\\\\textasciidieresis\\{\\}"),
    c("¯", "\\\\textasciimacron\\{\\}"),
    # arithmetic and numeric symbols
    c("°", "\\\\textdegree\\{\\}"),
    c("÷", "\\\\textdiv\\{\\}"),
    c("¼", "\\\\textonequarter\\{\\}"),
    c("½", "\\\\textonehalf\\{\\}"),
    c("¾", "\\\\textthreequarters\\{\\}"),
    c("×", "\\\\texttimes\\{\\}"),
    c("±", "\\\\textpm\\{\\}"),
    c("¹", "\\\\textonesuperior\\{\\}"),
    c("²", "\\\\texttwosuperior\\{\\}"),
    c("³", "\\\\textthreesuperior\\{\\}"),
    c("⁄", "\\\\textfractionsolidus\\{\\}"),
    c("√", "\\\\textsurd\\{\\}"),
    c("¬", "\\\\textlnot\\{\\}"),
    c("−", "\\\\textminus\\{\\}")
  )
)


#' Replace LaTeX accent and symbol commands in text with Unicode characters.
#'
#' Applies every rule of the global `substitutions` list in order; each rule
#' is c(<gsub replacement>, <regex pattern>).
#'
#' @param text Character vector containing LaTeX markup.
#' @return `text` after all substitutions have been applied.
detexify <- function(text) {
  apply_rule <- function(current, rule) {
    gsub(rule[2], rule[1], current)
  }
  # Fold the rules over the text, starting from the unmodified input.
  Reduce(apply_rule, substitutions, text)
}