Python etree 解析器无法追加元素

时间:2013-05-09 17:30:41

标签: python html-parsing psycopg2 xml.etree

请看我的日志:它显示我从Postgres取回的那一行已经从字符串转换成了元素(我打印了字符串,打印了元素,还打印了iselement的布尔值!),可是当我尝试追加它时,却报错说它不是一个元素。唉,真让人恼火。

import sys
from HTMLParser import HTMLParser
from xml.etree import cElementTree as etree
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element, SubElement, tostring
import psycopg2
import psycopg2.extras

def main():
    """Build one landing page per row of the ``landingpagedata`` table.

    For every row, ``template.html`` is parsed into an element tree, the
    title, headline, intro and recommended-background nodes are filled in
    from the row, and the serialized tree is written to the file named by
    ``row['page_name']``.
    """
    # Connect to an existing database
    conn = psycopg2.connect(dbname="**", user="**", password="**", host="/tmp/", port="**")

    # Open a cursor to perform database operations
    # (rows are accessed by column name below, hence the RealDictCursor factory)
    cur = conn.cursor(cursor_factory = psycopg2.extras.RealDictCursor)

    cur.execute("SELECT * FROM landingpagedata;")
    rows = cur.fetchall()

    class LinksParser(HTMLParser):
      """HTMLParser subclass that forwards parse events to a TreeBuilder."""

      def __init__(self):
          """Initialise the base parser and create the tree builder."""
          HTMLParser.__init__(self)
          # The builder comes from the module imported as ``etree``
          # (cElementTree), so the resulting tree uses that implementation.
          self.tb = etree.TreeBuilder()

      def handle_starttag(self, tag, attributes):
          """Open an element; the attribute pairs are converted to a dict."""
          self.tb.start(tag, dict(attributes))

      def handle_endtag(self, tag):
          """Close the current element."""
          self.tb.end(tag)

      def handle_data(self, data):
          """Add text data to the current element."""
          self.tb.data(data)

      def close(self):
          """Finish parsing and return whatever the tree builder's close() returns."""
          HTMLParser.close(self)
          return self.tb.close()

    template = 'template.html'



    # parser.feed(open('landingIndex.html').read()) #for testing
    # root = parser.close()

    for row in rows:
        # A fresh parser and a fresh parse of the template for every row.
        parser = LinksParser()

        parser.feed(open(template).read())
        root = parser.close()




        #title
        title = root.find(".//title")
        title.text = row['title']

        #headline
        h1_id_headline = root.find(".//h1")
        h1_id_headline.text = row['h1_id_headline']
        # print row['h1_id_headline']

        #intro
        p_class_intro = root.find(".//p[@class='intro']")
        p_class_intro.text = row['p_class_intro']
        # print row['p_class_intro']

这是问题发生的地方!

        #recommended
        # Empty the target div, then try to insert the markup stored in the row.
        p_class_recommendedbackground = root.find(".//div[@class='recommended_background_div']")
        print p_class_recommendedbackground
        p_class_recommendedbackground.clear()
        # NOTE(review): root was built through etree.TreeBuilder (cElementTree),
        # while this element is created by the module imported as ET
        # (xml.etree.ElementTree) -- a different ElementTree implementation.
        newElement = ET.fromstring(row['p_class_recommendedbackground'])
        print row['p_class_recommendedbackground']
        print ET.iselement(newElement)
        # This is the call that raises the TypeError shown in the log.
        p_class_recommendedbackground.append(newElement)

        html = tostring(root)
        # Opening in 'w' mode and closing immediately creates/truncates the file;
        # f is bound to the result of close() here and rebound on the next line.
        f = open(row['page_name'], 'w').close()
        f = open(row['page_name'], 'w')
        f.write(html)
        f.close()
        # f = ''
        # html = ''
        # Both parser and root are re-created at the top of the next iteration.
        parser.reset()
        root = ''

    # Close communication with the database
    cur.close()
    conn.close()

# Run the page builder only when this file is executed as a script.
if __name__ == "__main__":
    main()

我的日志是这样的:

{background: url(/images/courses/azRealEstate.png) center no-repeat;}
<Element 'div' at 0x10a999720>
<p class="recommended_background">Materials are are aimed to all aspiring real estate sales associates who wish to obtain the Arizona Real Estate Salesperson license, which is provided by the <a href="http://www.re.state.az.us/" style="text-decoration: underline;">Arizona Department of Real Estate</a>.</p>
True
Traceback (most recent call last):
  File "/Users/Morgan13/Programming/LandingPageBuilder/landingPages/landingBuilderTest.py", line 108, in <module> main()
  File "/Users/Morgan13/Programming/LandingPageBuilder/landingPages/landingBuilderTest.py", line 84, in main
    p_class_recommendedbackground.append(newElement)
TypeError: must be Element, not Element
[Finished in 0.1s with exit code 1]

1 个答案:

答案 0 :(得分:1)

我可以通过这种方式重现错误消息:

from xml.etree import cElementTree as etree
import xml.etree.ElementTree as ET

# croot is created by the cElementTree implementation (imported as etree).
croot = etree.Element('root')
# child is created by the xml.etree.ElementTree implementation (imported as ET).
child = ET.Element('child')
# Appending an element from one implementation to one from the other fails:
croot.append(child)
# TypeError: must be Element, not Element

问题的根本原因是我们把ElementTree的cElementTree实现与xml.etree.ElementTree实现混在了一起。这两者不应混用。

所以修复方法就是只选用其中一个,比如etree,并把另一个的所有出现之处都替换掉(例如,将ET替换为etree)。