I have a web crawler that fetches news articles from websites, and I am trying to store them in this table. I used the requests and BeautifulSoup libraries. The following code shows my crawler logic.
import requests
from bs4 import BeautifulSoup
import os
import datetime
import cx_Oracle
def scrappy(url):
    try:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        title = soup.find('title').text.split('|')[0]
        time = soup.find('span', attrs={'class': 'time_cptn'}).find_all('span')[2].contents[0]
        full_text = soup.find('div', attrs={'class': 'article_content'}).text.replace('Download The Times of India News App for Latest India News', '')
    except:
        return ('', '', '', '')
    else:
        return (title, time, url, full_text)
def pathmaker(name):
    path = "Desktop/Web_Crawler/CRAWLED_DATA/{}".format(name)
    try:
        os.makedirs(path)
    except OSError:
        pass
    else:
        pass
def filemaker(folder, links_all):
    #k=1
    for link in links_all:
        scrapped = scrappy(link)
        #textfile=open('Desktop/Web_Crawler/CRAWLED_DATA/{}/text{}.txt'.format(x,k),'w+')
        #k+=1
        Title = scrapped[0]
        Link = scrapped[2]
        Dates = scrapped[1]
        Text = scrapped[3]
        con = cx_Oracle.connect('shivams/tiger@127.0.0.1/XE')
        cursor = con.cursor()
        sql_query = "insert into newsdata values(:1,:2,:3,:4)"
        # This is the failing call: executemany() expects a list of row
        # sequences, not one flat list of bind values.
        cursor.executemany(sql_query, [Title, Link, Dates, Text])
        con.commit()
        cursor.close()
        con.close()
        #textfile.write('Title\n{}\n\nLink\n{}\n\nDate & Time\n{}\n\nText\n{}'.format(scrapped[0],scrapped[2],scrapped[1],scrapped[3]))
        #textfile.close()
    con.close()  # closes a connection that was already closed inside the loop
folders_links = [
    ('India', 'https://timesofindia.indiatimes.com/india'),
    ('World', 'https://timesofindia.indiatimes.com/world'),
    ('Business', 'https://timesofindia.indiatimes.com/business'),
    ('Homepage', 'https://timesofindia.indiatimes.com/')
]
for x, y in folders_links:
    pathmaker(x)
    r = requests.get(y)
    soup = BeautifulSoup(r.text, 'html.parser')
    if x != 'Homepage':
        links = soup.find('div', attrs={'class': 'main-content'}).find_all('span', attrs={'class': 'twtr'})
        links_all = ['https://timesofindia.indiatimes.com' + span['data-url'].split('?')[0] for span in links]
    else:
        links = soup.find('div', attrs={'class': 'wrapper clearfix'})
        total_links = links.find_all('a')
        links_all = []
        for tag in total_links:
            if tag.has_attr('href') and '.cms' in tag['href'] and 'http' not in tag['href'] and 'articleshow' in tag['href']:
                links_all.append('https://timesofindia.indiatimes.com' + tag['href'])
    filemaker(x, links_all)
Earlier I created text files and stored the news in them, but now I want to store it in a database so that my web application can access it. My database logic is in the filemaker function. I am trying to insert the values into the table, but it does not work and throws various kinds of errors. I followed other posts on this site, but they did not work in my case. Can anyone help me with this? Also, I am not sure whether this is the correct way to insert CLOB data, since I am using it for the first time. Any help would be appreciated.
Answer 0 (score: 0)
You can do the following:
cursor.execute(sql_query, [Title, Link, Dates, Text])
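Inside the loop in your filemaker function, that would look roughly like this (a sketch only; it also opens the connection once, outside the loop, instead of reconnecting for every link):

con = cx_Oracle.connect('shivams/tiger@127.0.0.1/XE')
cursor = con.cursor()
sql_query = "insert into newsdata values(:1,:2,:3,:4)"
for link in links_all:
    title, time, url, full_text = scrappy(link)
    # execute() takes exactly one row of bind values per call
    cursor.execute(sql_query, [title, url, time, full_text])
con.commit()
cursor.close()
con.close()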
Or, if you build up a list of those values, you can do the following:
allValues = []
allValues.append([Title, Link, Dates, Text])
cursor.executemany(sql_query, allValues)
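On your CLOB question: cx_Oracle lets you bind an ordinary Python string to a CLOB column, so the statements above should work for typical article lengths. For long texts it is safer to declare the CLOB bind explicitly with setinputsizes(). A minimal sketch, assuming the columns of newsdata are (title, link, date string, CLOB text) in that order; store_news is a hypothetical helper name, not something from your code:

def store_news(links_all):
    rows = []
    for link in links_all:
        title, time, url, full_text = scrappy(link)
        if title:  # skip links that failed to parse
            rows.append((title, url, time, full_text))
    con = cx_Oracle.connect('shivams/tiger@127.0.0.1/XE')
    cursor = con.cursor()
    # Declare bind :4 as a CLOB so long article texts are not rejected
    # as oversized VARCHAR2 values.
    cursor.setinputsizes(None, None, None, cx_Oracle.CLOB)
    cursor.executemany("insert into newsdata values(:1,:2,:3,:4)", rows)
    con.commit()
    cursor.close()
    con.close()

Batching all rows into one executemany() call is also considerably faster than one database round trip per article.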
Hope that clears things up!