我正在尝试理解删除和重新创建表并使用luigi将数据插入新创建的表的正确方法。我有多个CSV文件从前一个任务传递到应该插入数据库的任务。
我目前的代码是这样的。
class CreateTables(sqla.CopyToTable):
connection_string = DatabaseConfig().data_mart_connection_string
table = DatabaseConfig().table_name
def requires(self):
return CustomerJourneyToCSV()
def output(self):
return SQLAlchemyTarget(
connection_string=self.connection_string,
target_table="customerJourney_1",
update_id=self.update_id(),
connect_args=self.connect_args,
echo=self.echo)
def create_table(self, engine):
base = automap_base()
Session = sessionmaker(bind=engine)
session = Session()
metadata = MetaData(engine)
base.prepare(engine, reflect=True)
# Drop existing tables
for i in range(1, len(self.input())+1):
for t in base.metadata.sorted_tables:
if t.name in "{0}_{1}".format(self.table, i):
t.drop(engine)
# Create new tables and insert data
i = 1
for f in self.input():
df = pd.read_csv(f.path, sep="|")
df.fillna(value="", inplace=True)
ts = define_table_schema(df)
t = Table("{0}_{1}".format(self.table, i), metadata, *[Column(*c[0], **c[1]) for c in ts])
t.create(engine)
# TODO: Need to remove head and figure out how to stop the connection from timing out
my_insert = t.insert().values(df.head(500).to_dict(orient="records"))
session.execute(my_insert)
i +=1
session.commit()
代码可以创建表并插入数据,但会出现以下错误。
C:\Users\simon\AdviceDataMart\lib\site-packages\luigi\worker.py:191:
DtypeWarning: Columns (150) have mixed types. Specify dtype option on import
or set low_memory=False.
new_deps = self._run_get_new_deps()
File "C:\Users\simon\AdviceDataMart\lib\site-packages\luigi\worker.py", line
191, in run
new_deps = self._run_get_new_deps()
File "C:\Users\simon\AdviceDataMart\lib\site-packages\luigi\worker.py", line
129, in _run_get_new_deps
task_gen = self.task.run()
File "C:\Users\simon\AdviceDataMart\lib\site-
packages\luigi\contrib\sqla.py", line 375, in run
for row in itertools.islice(rows, self.chunk_size)]
File "C:\Users\simon\AdviceDataMart\lib\site-
packages\luigi\contrib\sqla.py", line 363, in rows
with self.input().open('r') as fobj:
AttributeError: 'list' object has no attribute 'open'
我不确定是什么导致这种情况,并且无法轻松调试luigi管道。我不确定这是否与run方法或输出方法的实现有关?