如何使用ruby和parslet解析rtf文本?

时间:2018-07-19 20:50:25

标签: ruby-on-rails ruby rtf parslet

我具有RTF文件中的以下数据:

  

{\ rtf1 \ ansi \ deff3 \ adeflang1025 \ n {\ fonttbl {\ f0 \ froman \ fprq2 \ fcharset0   Times New Roman;} {\ f1 \ froman \ fprq2 \ fcharset2   符号;} {\ f2 \ fswiss \ fprq2 \ fcharset0   Arial;} {\ f3 \ froman \ fprq2 \ fcharset128次   罗马;} {\ f4 \ fswiss \ fprq2 \ fcharset128   Arial;} {\ f5 \ fnil \ fprq2 \ fcharset128 Droid Sans   后备广告;} {\ f6 \ fnil \ fprq2 \ fcharset128 DejaVu   Sans;} {\ f7 \ fswiss \ fprq0 \ fcharset128 DejaVu   Sans;}} \ n {\ colortbl; \ red0 \ green0 \ blue0; \ red128 \ green128 \ blue128;} \ n {\ stylesheet {\ s0 \ snext0 \ nowidctlpar {\ * \ hyphen2 \ hyphlead2 \ hyphtrail2 \ hyphmax0} \\ cf0 \ kerning1 \ hich \ af5 \ langfe2052 \ dbch \ af6 \ afs24 \ lang1081 \ loch \ f3 \ fs24 \ lang1033   默认值;} \ n {\ s15 \ sbasedon0 \ snext16 \ sb240 \ sa120 \ keepn \ hich \ af5 \ dbch \ af6 \ afs28 \ loch \ f4 \ fs28   标题;} \ n {\ s16 \ sbasedon0 \ snext16 \ sb0 \ sa120文字   正文;} \ n {\ s17 \ sbasedon16 \ snext17 \ sb0 \ sa120 \ dbch \ af7   列表;} \ n {\ s18 \ sbasedon0 \ snext18 \ sb120 \ sa120 \ noline \ i \ dbch \ af7 \ afs24 \ ai \ fs24   标题;} \ n {\ s19 \ sbasedon0 \ snext19 \ noline \ dbch \ af7   索引;} \ n} {\ info {\ creatim \ yr2018 \ mo7 \ dy15 \ hr11 \ min52} {\ revtim \ yr0 \ mo0 \ dy0 \ hr0 \ min0} {\ printim \ yr0 \ mo0 \ dy0 \ hr0 \ min0 }{\评论   OpenOffice} {\ vern4140}} \ deftab709 \ n \ n {\ * \ pgdsctbl \ n {\ pgdsc0 \ pgdscuse195 \ pgwsxn12240 \ pghsxn15840 \ marglsxn1134 \ margrsxn1134 \ margtsxn1134 \ margbsxn1134 \ pgdscnxt0   默认值;}} \ n \ formshade \ paperh15840 \ paperw12240 \ margl1134 \ margr1134 \ margt1134 \ margb1134 \ sectd \ sbknone \ sectunlocked1 \ pgndec \ pgwsxn12240 \ pghsxn15840 \ marglsxn1134 \ margrsxnn1ft \ n \ t \ n \ n \ ft \ n \ ft \ n \ ft \ n \ n \ ft \ n \ ft \ n \ ft \ n \ n \ ft \ n \ ft \ n \ n \ ft \ n \ n \ ft \ n \ n \ ft \ n \ n \ ft \ n \ ft \ n \ n \ ft \ n \ n \ ft \ n \ n \ ft \ n \ n \ ft \ n \ n \ ft \ n \ n \ ft \ n \ n \ ft \ n \ n \ ft \ n \ n \ ft \ n \ n \ n \ ft \ n \ n \ ft \ n \ n \ ft \ n \ n \ ft \ n \ n \ aftnrstcont \ aftnstart1 \ aftnnrlc \ n \ pgndec \ pard \ plain   \ s0 \ nowidctlpar {\ * \ hyphen2 \ hyphlead2 \ hyphtrail2 \ hyphmax0} \ cf0 \ kerning1 \ hich \ af5 \ langfe2052 \ dbch \ af6 \ afs24 \ lang1081 \ loch \ f3 \ fs24 \ lang1033 {\ rtlch   \ ltrch \ loch \ n我喜欢阅读。} \ n \ par}

下面是Rob Miller的“使用Ruby进行文本处理”中的示例,下面是Parslet Parser:

require "parslet"

class Rtf < Parslet::Parser

   rule(:space)                    { str(" ") }

   rule(:hypen)                    { str("-") }

   rule(:integer)                  { match["0-9"].repeat(1) }

   rule(:newline)                  { str("\n") }

   rule(:slash)                    { str("\\") }

   rule(:letter_sequence)          { match["a-z"].repeat }

   rule(:special_chars)            { match["\\\\{}"] }

   rule(:unformatted_text)         { ( special_chars.absent? >> any ).repeat(1).as(:text) }

   rule(:control_word)             { ( slash >> 
                                       letter_sequence.as(:word) >> 
                                       control_delimiter.maybe.as(:delimiter) 
                                     ).as(:control_word) 
                                   }

   rule(:control_delimiter)        { space | ( hypen.maybe >> integer ) | str(";") }


  rule(:group)                     { 
                                      (
                                         str("{") >> 
                                         newline.maybe >>
                                         content >>
                                         newline.maybe >>
                                         str("}") 
                                      )
                                   }

  rule(:content)                   {  
                                     (
                                        unformatted_text | control_word | group
                                     ).repeat
                                   }

  rule(:header)                    {
                                      ( slash >> str("rtf") >> integer.maybe.as(:version) ).as(:rtf) >>  
                                      ( slash >> letter_sequence.as(:charset) ) >>
                                      ( slash >> str("deff") >> integer.maybe ).maybe.as(:deff) >>
                                      color_table.maybe.as(:color_table) >>
                                      newline.maybe
                                   }

  rule(:color_table)               {
                                      newline.maybe >>
                                      str("{") >>
                                      ( slash >> str("colortabl;") ) >> 
                                      color_definition.repeat(1).as(:colors) >>
                                      str("}") >>
                                      newline.maybe  
                                   }

  rule(:color_definition)          {
                                      slash >> str("red") >> (intger.as(:int)).as(:red) >>
                                      slash >> str("green") >> (intger.as(:int)).as(:green) >>
                                      slash >> str("blue") >> (intger.as(:int)).as(:blue) >>
                                      str(";")
                                   }

  rule(:file)                      {
                                      str("{") >>
                                      header.as(:header) >>
                                      content.as(:document) >>
                                      str("}") >>
                                      newline.maybe 
                                   }

  root :file

end

使用上述Parslet解析rtf文件会产生:

  

(byebug)解析了{:header => {:rtf => {:version =>“ 1” @ 5},   :charset =>“ ansi” @ 7,:deff =>“ \ deff3” @ 11,:color_table => nil},   :document => [{{:control_word => {:word =>“ adeflang” @ 18,   :delimiter =>“ 1025” @ 26}},{:text =>“ \ n” @ 30},{:text =>“ \ n” @ 374},   {:text =>“ \ n” @ 431},{:control_word => {:word =>“ deftab” @ 1050,   :delimiter =>“ 709” @ 1056}},{:text =>“ \ n \ n” @ 1059},{:text =>“ \ n” @ 1191},   {:control_word => {:word =>“ formshade” @ 1193,:delimiter => nil}},   {:control_word => {:word =>“ paperh” @ 1203,:delimiter =>“ 15840” @ 1209}},   {:control_word => {:word =>“ paperw” @ 1215,:delimiter =>“ 12240” @ 1221}},   {:control_word => {:word =>“ margl” @ 1227,:delimiter =>“ 1134” @ 1232}},   {:control_word => {:word =>“ margr” @ 1237,:delimiter =>“ 1134” @ 1242}},   {:control_word => {:word =>“ margt” @ 1247,:delimiter =>“ 1134” @ 1252}},   {:control_word => {:word =>“ margb” @ 1257,:delimiter =>“ 1134” @ 1262}},   {:control_word => {:word =>“ sectd” @ 1267,:delimiter => nil}},   {:control_word => {:word =>“ sbknone” @ 1273,:delimiter => nil}},   {:control_word => {:word =>“ sectunlocked” @ 1281,:delimiter =>“ 1” @ 1293}},   {:control_word => {:word =>“ pgndec” @ 1295,:delimiter => nil}},   {:control_word => {:word =>“ pgwsxn” @ 1302,:delimiter =>“ 12240” @ 1308}},   {:control_word => {:word =>“ pghsxn” @ 1314,:delimiter =>“ 15840” @ 1320}},   {:control_word => {:word =>“ marglsxn” @ 1326,:delimiter =>“ 1134” @ 1334}},   {:control_word => {:word =>“ margrsxn” @ 1339,:delimiter =>“ 1134” @ 1347}},   {:control_word => {:word =>“ margtsxn” @ 1352,:delimiter =>“ 1134” @ 1360}},   {:control_word => {:word =>“ margbsxn” @ 1365,:delimiter =>“ 1134” @ 1373}},   {:control_word => {:word =>“ ftnbj” @ 1378,:delimiter => nil}},   {:control_word => {:word =>“ ftnstart” @ 1384,:delimiter =>“ 1” @ 1392}},   {:control_word => {:word =>“ ftnrstcont” @ 1394,:delimiter => nil}},   {:control_word => {:word =>“ ftnnar” @ 1405,:delimiter => nil}},   {:control_word => {:word =>“ aenddoc” @ 1412,:delimiter => nil}},   {:control_word => {:word =>“ aftnrstcont” @ 1420,:delimiter => nil}},   {:control_word => {:word =>“ aftnstart” @ 1432,:delimiter =>“ 1” @ 1441}},   {:control_word => {:word =>“ aftnnrlc” @ 1443,:delimiter => nil}},   {:text =>“ \ n” @ 1451},{:control_word => {:word =>“ pgndec” @ 1453,   :delimiter => nil}},{:control_word => {:word =>“ pard” @ 1460,   :delimiter => nil}},{:control_word => {:word =>“普通” @ 1465,:delimiter =>“   “ @ 1470}},{:control_word => {:word =>” s“ @ 1472,:delimiter =>” 0“ @ 1473}},   {:control_word => {:word =>“ nowidctlpar” @ 1475,:delimiter => nil}},   {:control_word => {:word =>“ cf” @ 1529,:delimiter =>“ 0” @ 1531}},   {:control_word => {:word =>“字距调整” @ 1533,:delimiter =>“ 1” @ 1540}},   {:control_word => {:word =>“ hich” @ 1542,:delimiter => nil}},   {:control_word => {:word =>“ af” @ 1547,:delimiter =>“ 5” @ 1549}},   {:control_word => {:word =>“ langfe” @ 1551,:delimiter =>“ 2052” @ 1557}},   {:control_word => {:word =>“ dbch” @ 1562,:delimiter => nil}},   {:control_word => {:word =>“ af” @ 1567,:delimiter =>“ 6” @ 1569}},   {:control_word => {:word =>“ afs” @ 1571,:delimiter =>“ 24” @ 1574}},   {:control_word => {:word =>“ lang” @ 1577,:delimiter =>“ 1081” @ 1581}},   {:control_word => {:word =>“ loch” @ 1586,:delimiter => nil}},   {:control_word => {:word =>“ f” @ 1591,:delimiter =>“ 3” @ 1592}},   {:control_word => {:word =>“ fs” @ 1594,:delimiter =>“ 24” @ 1596}},   {:control_word => {:word =>“ lang” @ 1599,:delimiter =>“ 1033” @ 1603}},   {:text =>“ \ n” @ 1643},{:control_word => {:word =>“ par” @ 1645,:delimiter =>“   “ @ 1648}}]}

没有解析RTF文件中的任何文本,即“我喜欢阅读。”,我也不知道为什么。任何指导将不胜感激。

1 个答案:

答案 0 :(得分:1)

这是因为您缺少所有组。

layoutsubviews

添加'as(:group)'

现在您可以得到

import UIKit

class TableViewCell: UITableViewCell {

  override init(style: UITableViewCellStyle, reuseIdentifier: String?) {
    super.init(style: style, reuseIdentifier: reuseIdentifier)

    contentView.do {
      $0.backgroundColor = .white
      $0.addShadow(ofColor: .black, radius: 1.5, offset: .zero, opacity: 0.1)
      $0.cornerRadius = 6
    }
  }

  required init?(coder aDecoder: NSCoder) {
    fatalError("init(coder:) has not been implemented")
  }

  override func layoutSubviews() {
    super.layoutSubviews()

    contentView.pin.all(UIEdgeInsets(inset: 5))
  }

  override func setHighlighted(_ highlighted: Bool, animated: Bool) {
    super.setHighlighted(highlighted, animated: animated)

    contentView.backgroundColor = highlighted ? UIColor(rgb: 217) : .white
  }
}