我正在抓取一个在线表格,每15分钟更新一次。我正在尝试编写一个使用日期和时间列来检查条目是否已存在的脚本,如果不存在,请插入新数据。
但是,即使表已更新,最底部的IF语句也始终显示“无新数据可插入”。如果我完全删除了IF语句而只使用insert_data()
,它将按预期插入数据,因此我知道到目前为止的工作原理。
谁能看到哪里出了问题?
import requests
import pandas as pd
import psycopg2
import datetime
import time
from bs4 import BeautifulSoup
# GET THE DATA
url = 'https://harvardforest.fas.harvard.edu/weather/15-minute-tables-metric-units'
resp = requests.get(url)
soup = BeautifulSoup(resp.content, 'html.parser')
fishbox = soup.find('pre')
for tag in fishbox.find_all():
tag.extract()
table = fishbox.text.strip()
#tableS = table.split('\n')
print(table.split('\r\n')[-1])
# INSERT THE DATA
def create_table():
'''Create table in database.'''
try:
conn = psycopg2.connect('''
dbname = 'sensor'
user = 'user'
password = 'pass'
port = '5432'
host = 'database-1.sadasdasd.us-east-1.rds.amazonaws.com'
''')
cur = conn.cursor()
# Create table for database.
cur.execute('''create table if not exists fisherMetStation (
timestamp TIMESTAMP NOT NULL,
Date text,
Time text,
AirT_C text,
Rh text,
DewP text,
Prec text,
SlrR text,
ParR text,
NetR text,
Bar text,
Wspd text,
Wdir text,
Gspd text,
S10T text
)''')
except Exception as e:
print(e)
return
conn.commit()
conn.close()
create_table()
def insert_data(timestamp, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14):
'''Insert data into the database.'''
conn = psycopg2.connect('''
dbname = 'sensor'
user = 'user'
password = 'pass'
port = '5432'
host = 'database-1.sadasdasd.us-east-1.rds.amazonaws.com'
''')
cur = conn.cursor()
cur.execute('''INSERT INTO fisherMetStation
(timestamp, Date, Time, AirT_C, Rh, DewP, Prec, SlrR, ParR, NetR, Bar, Wspd, Wdir, Gspd, S10T)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''', (timestamp, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14))
conn.commit()
conn.close()
def read_data():
'''Insert data into the database.'''
conn = psycopg2.connect('''
dbname = 'sensor'
user = 'user'
password = 'pass'
port = '5432'
host = 'database-1.sadasdasd.us-east-1.rds.amazonaws.com'
''')
cur = conn.cursor()
cur.execute('''select Date, Time from fisherMetStation ORDER BY timestamp DESC LIMIT 1''')
records = cur.fetchall()
print(records[0][0], records[0][1])
conn.commit()
conn.close()
data1 = table.split('\r\n')[-1]
data2 = data1.split()
time_now = datetime.datetime.today()
timestamp = datetime.datetime.strftime(time_now, "%Y-%m-%d %H:%M:%S")
sensor1 = data2[0]
sensor2 = data2[1]
sensor3 = data2[2]
sensor4 = data2[3]
sensor5 = data2[4]
sensor6 = data2[5]
sensor7 = data2[6]
sensor8 = data2[7]
sensor9 = data2[8]
sensor10 = data2[9]
sensor11 = data2[10]
sensor12 = data2[11]
sensor13 = data2[12]
sensor14 = data2[13]
# Save data in a Postgresql database.
while True:
# run every 10 minutes
time.sleep(600)
if read_data() != print(sensor1, sensor2):
insert_data(timestamp, sensor1, sensor2, sensor3, sensor4, sensor5, sensor6, sensor7, sensor8, sensor9, sensor10, sensor11, sensor12, sensor13, sensor14)
print("data inserted")
else:
print("no new data to insert")
答案 0 :(得分:1)
#####you should return records in read_data######
def read_data():
# your code
records = cur.fetchall()
result=[]
for rec in records[0]:
result.append(rec)
print(records[0][0], records[0][1])
conn.commit()
conn.close()
return result
#compare time in records with sensor1(time) and date in records with sensor(2) if any of them do not match then execute if
result=read_data()
if result[0] != sensor1 or result[1] != sensor2:
insert_data(timestamp, sensor1, sensor2, sensor3, sensor4, sensor5, sensor6, sensor7, sensor8, sensor9, sensor10, sensor11, sensor12, sensor13, sensor14)
print("data inserted")
else:
print("no new data to insert")