有什么好的实现可以对数字形式的 HTML / XML 实体(例如 `&#10;`)进行反转义,
并用等效的 ASCII 字符替换它们?
表示为单元测试:
-- Unit test describing the requested behavior: numeric character references
-- are decoded, while the unknown named entity "&ok;" is left untouched.
local orig = "It&#39;s the &#34;end&#34; &ok;&#10;"
local fixd = unescape(orig) -- Implement this
assert( fixd == "It's the \"end\" &ok;\n" )
答案 0 :(得分:8)
这是一个简单的实现,它还能处理核心的 XML 命名实体:
--- Unescape the core named XML entities and numeric character references.
-- Handles &lt; &gt; &quot; &apos; &amp; as well as decimal (&#NN;) and
-- hexadecimal (&#xHH;) references. Each kind of entity is replaced in its own
-- pass over the string, so text produced by one pass can be picked up by a
-- later one: "&#38;amp;" ends up as "&" instead of "&amp;".
-- @param str string to unescape
-- @return the unescaped string
function unescape(str)
  -- string.char raises an error outside 0..255; returning nil instead makes
  -- gsub keep the matched reference unchanged.
  local function char_for(code)
    if code and code >= 0 and code <= 255 then
      return string.char(code)
    end
  end
  str = string.gsub( str, '&lt;', '<' )
  str = string.gsub( str, '&gt;', '>' )
  str = string.gsub( str, '&quot;', '"' )
  str = string.gsub( str, '&apos;', "'" )
  str = string.gsub( str, '&#(%d+);', function(n) return char_for(tonumber(n)) end )
  -- %x (not %d) so that hex digits a-f / A-F are accepted
  str = string.gsub( str, '&#x(%x+);', function(n) return char_for(tonumber(n,16)) end )
  str = string.gsub( str, '&amp;', '&' ) -- Be sure to do this after all others
  return str
end
-- Mixes numeric references and named entities for the same two characters.
print(unescape("&#34;Hello&quot; &apos;World&#39;")) --> "Hello" 'World'
但请注意,这在一种病态情况下会失败:数字形式的 & 实体(`&#38;`)后面紧跟文本 `amp;`:
print(unescape("Ampersand entity is &#38;amp;")) --> Ampersand entity is &
-- The result should actually be Ampersand entity is &amp;
我们可以通过一次处理所有实体来修复此边缘情况,但代码变得更加丑陋:
--- Unescape XML entities and numeric character references in a single pass.
-- Every "&...;" sequence is examined exactly once, so already-decoded text is
-- never decoded a second time ("&#38;amp;" becomes "&amp;").
-- Sequences that are not recognised are left unchanged, including numeric
-- references above 255, which string.char cannot represent as one byte.
-- @param str string to unescape
-- @return the unescaped string
function unescape(str)
  local map={ ["lt"]="<", ["gt"]=">", ["amp"]="&", ["quot"]='"', ["apos"]="'" }
  -- Captures: the whole entity, its prefix ("", "#", "#x" or "x"), its body.
  str = string.gsub( str, '(&(#?x?)([%d%a]+);)', function(orig,n,s)
    if n == '' then
      return map[s] or orig
    end
    local code
    if n == '#x' then
      code = tonumber(s, 16) -- nil when s contains non-hex characters
    elseif n == '#' and string.find(s, '^%d+$') then
      -- digits only: a bare tonumber(s) would also accept forms like "1e2"
      code = tonumber(s)
    end
    if code and code >= 0 and code <= 255 then
      return string.char(code)
    end
    return orig
  end )
  return str
end
print(unescape("Ampersand entity is &#38;amp;")) --> Ampersand entity is &amp;
最后,我们可以把它展开,以获得更快的速度:
-- Library functions, the entity table and the replacement callback live at
-- file level so they are created once instead of on every unescape() call.
local gsub, char = string.gsub, string.char
local entityMap = {["lt"]="<",["gt"]=">",["amp"]="&",["quot"]='"',["apos"]="'"}

--- gsub callback: compute the replacement for one matched entity.
-- @param orig the whole matched entity, e.g. "&#38;"
-- @param n the prefix captured after "&": "", "#", "#x" or "x"
-- @param s the entity body (letters and digits)
-- @return the decoded text, or orig when the entity is not recognised or the
--   numeric value is outside 0..255 (string.char would raise an error)
local entitySwap = function(orig,n,s)
  if n == '' then
    return entityMap[s] or orig
  end
  local code
  if n == '#' then
    -- digits only: a bare tonumber(s) would also accept forms like "1e2"
    if s:find('^%d+$') then code = tonumber(s) end
  elseif n == '#x' then
    code = tonumber(s, 16) -- nil when s contains non-hex characters
  end
  if code and code >= 0 and code <= 255 then
    return char(code)
  end
  return orig
end

--- Unescape XML entities and numeric character references in a single pass.
-- @param str string to unescape
-- @return the unescaped string (the parentheses drop gsub's match count)
function unescape(str)
  return (gsub( str, '(&(#?x?)([%d%a]+);)', entitySwap ))
end
答案 1 :(得分:2)
对于少数在下载法语 HTML 内容时可能需要对重音字符进行反转义的程序员,这里是上述函数的一个更全面的版本。
-- Named entities for accented letters and symbols (plus the euro sign and the
-- ampersand), mapped to the text each one is replaced with.
-- NOTE(review): the core entities lt/gt/quot/apos are not listed here,
-- mirroring the original replacement list -- confirm whether that is intended.
-- NOTE(review): named entities expand to the literals as encoded in this
-- source file, while numeric references expand to a single byte via
-- string.char -- confirm that this matches the encoding callers expect.
local latin1_entities = {
  ["nbsp"]=' ',   ["iexcl"]='¡',  ["cent"]='¢',   ["pound"]='£',
  ["curren"]='¤', ["yen"]='¥',    ["brvbar"]='¦', ["sect"]='§',
  ["uml"]='¨',    ["copy"]='©',   ["ordf"]='ª',   ["laquo"]='«',
  ["not"]='¬',    ["shy"]='',     ["reg"]='®',    ["macr"]='¯',
  ["deg"]='°',    ["plusmn"]='±', ["sup2"]='²',   ["sup3"]='³',
  ["acute"]='´',  ["micro"]='µ',  ["para"]='¶',   ["middot"]='·',
  ["cedil"]='¸',  ["sup1"]='¹',   ["ordm"]='º',   ["raquo"]='»',
  ["frac14"]='¼', ["frac12"]='½', ["frac34"]='¾', ["iquest"]='¿',
  ["Agrave"]='À', ["Aacute"]='Á', ["Acirc"]='Â',  ["Atilde"]='Ã',
  ["Auml"]='Ä',   ["Aring"]='Å',  ["AElig"]='Æ',  ["Ccedil"]='Ç',
  ["Egrave"]='È', ["Eacute"]='É', ["Ecirc"]='Ê',  ["Euml"]='Ë',
  ["Igrave"]='Ì', ["Iacute"]='Í', ["Icirc"]='Î',  ["Iuml"]='Ï',
  ["ETH"]='Ð',    ["Ntilde"]='Ñ', ["Ograve"]='Ò', ["Oacute"]='Ó',
  ["Ocirc"]='Ô',  ["Otilde"]='Õ', ["Ouml"]='Ö',   ["times"]='×',
  ["Oslash"]='Ø', ["Ugrave"]='Ù', ["Uacute"]='Ú', ["Ucirc"]='Û',
  ["Uuml"]='Ü',   ["Yacute"]='Ý', ["THORN"]='Þ',  ["szlig"]='ß',
  ["agrave"]='à', ["aacute"]='á', ["acirc"]='â',  ["atilde"]='ã',
  ["auml"]='ä',   ["aring"]='å',  ["aelig"]='æ',  ["ccedil"]='ç',
  ["egrave"]='è', ["eacute"]='é', ["ecirc"]='ê',  ["euml"]='ë',
  ["igrave"]='ì', ["iacute"]='í', ["icirc"]='î',  ["iuml"]='ï',
  ["eth"]='ð',    ["ntilde"]='ñ', ["ograve"]='ò', ["oacute"]='ó',
  ["ocirc"]='ô',  ["otilde"]='õ', ["ouml"]='ö',   ["divide"]='÷',
  ["oslash"]='ø', ["ugrave"]='ù', ["uacute"]='ú', ["ucirc"]='û',
  ["uuml"]='ü',   ["yacute"]='ý', ["thorn"]='þ',  ["yuml"]='ÿ',
  ["euro"]='€',   ["amp"]='&',
}

--- Unescape accent/symbol entities, &amp; and numeric character references.
-- All entities are handled in one pass, so decoded text is never decoded a
-- second time ("&#38;amp;" becomes "&amp;"). Anything that is not recognised
-- is left unchanged, including numeric references above 255, which
-- string.char cannot represent as one byte.
-- @param str string to unescape
-- @return the unescaped string
local function unescape(str)
  -- Captures: the whole entity, an optional "#", and the alphanumeric body.
  local result = string.gsub( str, '(&(#?)(%w+);)', function(entity, hash, body)
    if hash == '' then
      -- "&shy;" maps to the empty string, which is still truthy in Lua.
      return latin1_entities[body] or entity
    end
    local code
    local hex = string.match(body, '^x(%x+)$')
    if hex then
      code = tonumber(hex, 16)
    elseif string.find(body, '^%d+$') then
      code = tonumber(body)
    end
    if code and code >= 0 and code <= 255 then
      return string.char(code)
    end
    return entity
  end )
  return result
end
答案 2 :(得分:0)
现在可以通过 LuaRocks 获得 htmlEntities-for-lua 模块,您应该改用它。它在代码中只使用 3 次 string.gsub 迭代,因此速度更快,而且更完整。
-- Load the htmlEntities module, decode the string and print the result.
htmlEntities = require('htmlEntities')
local decoded = htmlEntities.decode(yourString)
print(decoded)
或将其用作当前功能的替代品:
-- Expose the module's decode function under the name myUnescape.
do
  local entities = require('htmlEntities')
  myUnescape = entities.decode
end