Running a Python script with crontab

Time: 2021-06-28 15:38:18

Tags: python-3.x cron ubuntu-server

I am running a cleanup_mysql_backups.py file via a cleanup_mysql_backups.sh file, using crontab on my Ubuntu server. I based cleanup_mysql_backups.sh on another file, stocks_etl.sh, which I already use to run stocks_etl.py, also via crontab. I've included my crontab below, along with both .sh scripts and both .py scripts. All of the pipelines listed in the crontab output were copied from the stocks_etl pipeline. Every day at the scheduled time, the first two pipelines below (stocks_etl, mysql_backup) run fine: they write output to their log files and complete the tasks in their Python scripts. The last two (cleanup_mysql_backups, cleanup_downloads) both write the same initial output to their cleanup_mysql_backups.log and cleanup_downloads.log files, but then produce no further logs and do not delete the files and directories targeted by their Python scripts. If I instead run the cleanup_mysql_backups.py code manually with sudo, using the same command as in cleanup_mysql_backups.sh, it works. Does anyone see what the problem might be and how to fix it? The crontab and scripts are below.
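(Editor's note: one way to approximate cron's stripped-down environment when testing a job by hand is to clear the inherited environment first, since a script that runs fine from an interactive sudo shell can still fail under cron for environment reasons alone. A diagnostic sketch — the PATH shown is cron's usual default, and the script path is the one from the question:

env -i HOME=/home/user PATH=/usr/bin:/bin SHELL=/bin/sh /home/user/cleanup_mysql_backups.sh

If the script fails the same way here as it does under cron, the problem is environmental rather than in the Python code.)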

sudo crontab -e output:

# Edit this file to introduce tasks to be run by cron.
# 
# Each task to run has to be defined through a single line
# indicating with different fields when the task will be run
# and what command to run for the task
# 
# To define the time you can provide concrete values for
# minute (m), hour (h), day of month (dom), month (mon),
# and day of week (dow) or use '*' in these fields (for 'any').
# 
# Notice that tasks will be started based on the cron's system
# daemon's notion of time and timezones.
# 
# Output of the crontab jobs (including errors) is sent through
# email to the user the crontab file belongs to (unless redirected).
# 
# For example, you can run a backup of all your user accounts
# at 5 a.m every week with:
# 0 5 * * 1 tar -zcf /var/backups/home.tgz /home/
# 
# For more information see the manual pages of crontab(5) and cron(8)
# 
# m h  dom mon dow   command
59 5 * * * /home/user/stocks_etl.sh >>/home/user/logs/etl_scripts/crontab.log 2>&1

15 9 * * * /home/user/mysql_backup.sh >>/home/user/logs/etl_scripts/mysql_backup.log 2>&1

10 15 * * * /home/user/cleanup_mysql_backups.sh >>/home/user/logs/etl_scripts/cleanup_mysql_backups.log 2>&1

20 11 * * * /home/user/cleanup_downloads.sh >>/home/user/logs/etl_scripts/cleanup_downloads.log 2>&1
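(Editor's note: if environment differences are suspected, variables can also be declared at the top of the crontab so every job sees them. A sketch — the anaconda3 location is taken from the shebang lines further down:

SHELL=/bin/bash
PATH=/home/user/anaconda3/bin:/usr/local/bin:/usr/bin:/bin
)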

cleanup_mysql_backups.sh script:

#!/bin/bash
source activate py36
python /home/user/cleanup_mysql_backups.py

stocks_etl.sh script:

#!/bin/bash
source activate py36
python /home/user/stocks_etl.py

cleanup_mysql_backups.py script:

#!/home/user/anaconda3/envs/py36/bin/python

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from yahoofinancials import YahooFinancials

import pymysql

import glob
import logging
import os
import time

from sqlalchemy import create_engine

# only the datetime class is used below, so the duplicate
# 'import datetime' lines are collapsed into this single import
from datetime import datetime


# start logging

# adding a timestamp to logname
# ts=str(datetime.datetime.now().isoformat()) 
ts=str(datetime.now().isoformat())

# logging.basicConfig(filename='example.log',level=logging.DEBUG)
logging.basicConfig(filename='/home/user/logs/etl_scripts/cleanup_mysql_backups_'+ts+'.log', level=logging.DEBUG, 
                    format='%(asctime)s %(levelname)s %(name)s %(message)s')

logger=logging.getLogger(__name__)


path='/home/user/mysql_backups/'

# path already ends with '/', so no extra separator is needed
all_files = glob.glob(path + "*.sql")




# building the list of backups more than 14 days old

delete_list=[]

for p in all_files:

    try:
        # filenames are expected to end in an underscore-delimited ISO date,
        # e.g. name_YYYY-MM-DD.sql
        file_date=p.split('/')[-1].split('_')[-1].split('.')[0]

        path_date=datetime.strptime(file_date,"%Y-%m-%d")

        delta=datetime.today()-path_date

        if int(delta.days)>14:

            delete_list.append(p)

            logging.info('delete list add backup: '+p)

    except Exception as err:

        logger.error('delete list add backup failed: '+p+' got error: '+str(err))



# deleting backups over 2 weeks old

for f in delete_list:

    try:

        os.remove(f)

        logging.info('deleted backup: '+f)

    except Exception as err:

        logger.error('delete backup failed: '+f+' got error: '+str(err))
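(Editor's note: for reference, the split chain above assumes backup filenames end in an underscore-delimited ISO date. A minimal illustration with a hypothetical filename:

# hypothetical backup name following the pattern the script expects
p = '/home/user/mysql_backups/stocks_backup_2021-06-01.sql'

file_date = p.split('/')[-1].split('_')[-1].split('.')[0]   # -> '2021-06-01'
)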

stocks_etl.py script:

#!/home/user/anaconda3/envs/py36/bin/python


import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from yahoofinancials import YahooFinancials

import pymysql

import datetime
import glob
import logging
import os
import time

from sqlalchemy import create_engine


# helper functions


# function for creating error logs
# Note: function not currently working -- the logger it configures is bound
# to a local variable, so code outside the function never sees it

def error_logger(path):

    # adding a timestamp to logname
    ts=str(datetime.datetime.now().isoformat())

    # logging.basicConfig(filename='example.log',level=logging.DEBUG)
    logging.basicConfig(filename=path+ts+'.log', level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(name)s %(message)s')

    logger=logging.getLogger(__name__)


# function to query mysql db and return dataframe of results
def mysql_query(user,password,database,host,query):

    connection = pymysql.connect(user=user, password=password, database=database, host=host)

    try:
        df = pd.read_sql(query, connection)

        logging.info('query succeeded: '+query)

        return df

    except Exception as err:

        logger.error('query failed: '+query+' got error: '+str(err))

    finally:
        # close the connection whether or not the query succeeded
        connection.close()

        logging.info('close connection mysql')

# function to download OHLC stock data

def download_stocks(Ticker_list,start_date,end_date,time_interval,path):

    # get data for each stock in Ticker_list and save it as csv

    failed_list=[]
    passed_list=[]

    for ticker in Ticker_list:

        try:

            yahoo_financials = YahooFinancials(ticker)
            data = yahoo_financials.get_historical_price_data(start_date, end_date, time_interval=time_interval)

            prices_df=pd.DataFrame(data[ticker]['prices'])

            prices_df=prices_df[['adjclose', 'close', 'formatted_date', 'high', 'low', 'open',
                   'volume']]

            prices_df['date']=prices_df['formatted_date']

            prices_df=prices_df[['date','adjclose', 'close', 'high', 'low', 'open',
                   'volume']]

            prices_df['Ticker']=ticker

            prices_df.to_csv(path+ticker+'.csv')

            passed_list.append(ticker)

            logging.info('downloaded: '+ticker)

            # throttle requests between downloads
            time.sleep(1)

        except Exception as err:

            failed_list.append(ticker)
            logger.error('tried download: '+ticker+' got error: '+str(err))
        

# function to read the csvs back in and append them to one dataframe

def stock_dataframe(path):

    try:
        all_files = glob.glob(path + "/*.csv")

        li = []

        for filename in all_files:
            df = pd.read_csv(filename, index_col=None, header=0)
            li.append(df)

        frame = pd.concat(li, axis=0, ignore_index=True)

        frame=frame[['date', 'adjclose', 'close', 'high', 'low', 'open',
               'volume', 'Ticker']]

        logging.info('created stock dataframe')

        return frame

    except Exception as err:

        logger.error('stock dataframe create failed got error: '+str(err))


# write dataframe to mysql db

def write_dataframe(username, password, host, schema,dataframe,table,if_exists,index):

    try:

        engine = create_engine("mysql+pymysql://"+str(username)+":"+str(password)+"@"+str(host)+"/"+str(schema))

        dataframe.to_sql(con=engine, name=table, if_exists=if_exists, index=index)

        logging.info('write_dataframe succeeded')

    except Exception as err:

        logger.error('write_dataframe failed got error: '+str(err))




# to do

# - create directory with datetime prefix as part of path
# - add step that checks max date in current table
# - only pull data later than max date in current table
# - check max date in current derived table
# - only pull data later than current date from source table


def etl_pipeline(table_var):

    i=table_var

    max_date_query="select max(date) as max_date from "+i

    try:

        max_date_df=mysql_query(user='user',
                            password='xxx',
                            database='stocks',
                            host='xx.xx.xx.xx',
                            query=max_date_query)

        logging.info('max_date succeeded: '+i)

    except Exception as err:

        logger.error('max_date failed: '+i+' got error: '+str(err))
        



    try:
        # get max date
        max_date=max_date_df.astype(str)['max_date'][0]

        # create directory

        base_path='/home/user/stock_data_downloads/'

        # get current_date
        current_date=datetime.datetime.today().strftime('%Y-%m-%d')

        directory_path=base_path+i+'/'+current_date

        # create directory for downloading new stocks in to
        os.mkdir(directory_path)

        logging.info('create directory succeeded: '+i)

    except Exception as err:

        logger.error('create directory failed: '+i+' got error: '+str(err))




    # getting ticker symbols

    ticker_query="select distinct ticker as ticker from "+i

    try:

        tickers_df=mysql_query(user='user',
                            password='xxx',
                            database='stocks',
                            host='xx.xx.xx.xx',
                            query=ticker_query)

        logging.info('get tickers succeeded: '+i)

    except Exception as err:

        logger.error('get tickers failed: '+i+' got error: '+str(err))




    # get ticker symbols 
    stocks=tickers_df.ticker.tolist()


    # download stocks
    # Note: must add '/' to end of path
    # '2019-01-01', '2021-01-01', time_interval='daily'
    download_stocks(Ticker_list=stocks,
                    start_date=max_date,
                    end_date=current_date,
                    time_interval='daily',
                    path=directory_path+'/')




    # create dataframe
    stocks_df=stock_dataframe(path=directory_path)




    # create mysql table
    write_dataframe(username='user', 
                    password='xxx', 
                    host='xx.xx.xx.xx', 
                    schema='stocks',
                    dataframe=stocks_df,
                    table=i,
                    if_exists='append',
                    index=False)




    # creating additional avg annual returns

    try:

        query="""select ticker, avg(annual_returns) as avg_annual_returns from (
        select ticker,date, ( -1 +
                a.adjclose / max(a.adjclose) over (partition by ticker
                                             order by date
                                             range between interval 365 day preceding and interval 365 day preceding
                                            )
               ) as annual_returns
        from """+i+""" a
        ) b where annual_returns is not null
        group by ticker"""

        df=mysql_query(user='user',password='xxx',database='stocks',host='xx.xx.xx.xx',query=query)

        logging.info('etl succeeded: '+i+'_returns')

    except Exception as err:

        logger.error('etl failed: '+i+'_returns got error: '+str(err))




    # adding additional avg annual returns to table

    # create mysql table
    write_dataframe(username='user', 
                    password='xxx', 
                    host='xx.xx.xx.xx', 
                    schema='stocks',
                    dataframe=df,
                    table=i+'_returns',
                    if_exists='replace',
                    index=False)
    
    
# start logging

# adding a timestamp to logname
ts=str(datetime.datetime.now().isoformat())

# logging.basicConfig(filename='example.log',level=logging.DEBUG)
logging.basicConfig(filename='/home/user/logs/etl_scripts/'+ts+'.log', level=logging.DEBUG,
                    format='%(asctime)s %(levelname)s %(name)s %(message)s')

logger=logging.getLogger(__name__)


table_list=['trav_stocks','s_and_p','american_mutual_funds','uranium']

for j in table_list:

    try:

        etl_pipeline(j)

        logging.info('etl_pipeline succeeded: '+j)

    except Exception as err:

        logger.error('etl_pipeline failed: '+j+' got error: '+str(err))

1 answer:

Answer 0: (score: 0)

Change the .sh file to look like the following:

#!/bin/bash
/home/user/anaconda3/condabin/conda run -n py36 python /home/user/cleanup_mysql_backups.py
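`conda run -n py36` executes the command inside the py36 environment without requiring the shell to have been conda-initialized. That is the likely difference from `source activate py36`: cron runs jobs with a minimal PATH that usually does not include anaconda3/bin, so the `activate` script is never found, `python` falls back to the system interpreter, and the script dies on its first missing import — before it can create its timestamped log, which matches the symptoms above. An alternative that keeps the original script structure is to load conda's shell hook explicitly before activating; a sketch, assuming the standard anaconda3 layout implied by the shebang lines above:

#!/bin/bash
# make conda's shell functions available in cron's non-interactive shell
source /home/user/anaconda3/etc/profile.d/conda.sh
conda activate py36
python /home/user/cleanup_mysql_backups.py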