I'm trying to follow the walkthrough here to build a pipeline that applies an sklearn model to my data. My command-line invocation and the resulting error are below:
(venv) computer:predictions uswygst$ python predictions.py \
--runner DataflowRunner \
--project my_project \
--requirements_file "requirements.txt" \
--temp_location gs://my_bucket/template/ \
--worker_machine_type n1-standard-8 \
--num_workers 5
predictions.py:57: BeamDeprecationWarning: parse_table_schema_from_json is deprecated since 2.11.0. Use bigquery_tools.parse_table_schema_from_json instead.
{ 'name': 'title', 'type': 'STRING'}]}))
/opt/anaconda3/envs/venv/lib/python3.7/site-packages/apache_beam/io/gcp/bigquery.py:1479: BeamDeprecationWarning: options is deprecated since First stable release. References to <pipeline>.options will not be supported
experiments = p.options.view_as(DebugOptions).experiments or []
/opt/anaconda3/envs/venv/lib/python3.7/site-packages/apache_beam/runners/dataflow/ptransform_overrides.py:315: BeamDeprecationWarning: BigQuerySink is deprecated since 2.11.0. Use WriteToBigQuery instead.
kms_key=self.kms_key))
You are using pip version 9.0.3, however version 20.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
You are using pip version 9.0.3, however version 20.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
WARNING:root:Make sure that locally built Python SDK docker image has Python 3.7 interpreter.
Traceback (most recent call last):
File "predictions.py", line 75, in <module>
result.wait_until_finish()
File "/opt/anaconda3/envs/venv/lib/python3.7/site-packages/apache_beam/runners/dataflow/dataflow_runner.py", line 1629, in wait_until_finish
self)
apache_beam.runners.dataflow.dataflow_runner.DataflowRuntimeException: Dataflow pipeline failed. State: FAILED, Error:
A setup error was detected in beamapp-uswygst-091501355-09141837-d5ts-harness-qdhg. Please refer to the worker-startup log for detailed information.
When I look at the worker logs, I see:
Failed to install packages: failed to install requirements: exit status 1
The code for the single Python file I'm submitting is:
import apache_beam as beam
import argparse
from google.cloud import storage
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.io.gcp.bigquery import parse_table_schema_from_json
import pandas as pd
import pickle as pkl
import joblib
import json
query = """
SELECT index, product_title
FROM `my_project.my_dataset.my_table`
"""
class ApplyDoFn(beam.DoFn):
    def __init__(self):
        self._model = None
        self._textExtraction = None
        self._translationDictionary = None
        self._storage = storage
        self._pkl = pkl
        self._pd = pd
        self._joblib = joblib

    def process(self, element):
        # Lazily load the vectorizer, the id-to-category dictionary and the
        # model from GCS the first time this worker processes an element.
        if self._textExtraction is None:
            bucket = self._storage.Client().get_bucket(
                'marketing-analytics-data')
            blob = bucket.get_blob('tfidfit')
            self._textExtraction = self._pkl.loads(
                blob.download_as_string(), encoding='latin-1')
        if self._translationDictionary is None:
            bucket = self._storage.Client().get_bucket(
                'marketing-analytics-data')
            blob = bucket.get_blob('id_to_category')
            self._translationDictionary = self._pkl.loads(
                blob.download_as_string())
        if self._model is None:
            bucket = self._storage.Client().get_bucket(
                'marketing-analytics-data')
            blob = bucket.get_blob('model.joblib')
            model_local = 'local_model'
            blob.download_to_filename(model_local)
            # load the model from the downloaded local file
            self._model = self._joblib.load(model_local)
        new_x = self._pd.DataFrame.from_dict(
            element, orient="index").transpose().fillna(0)
        pred_id = self._model.predict(
            self._textExtraction.transform(new_x.iloc[:, 1]).toarray()).tolist()[0]
        return [{'index': element['index'],
                 'product_title': element['product_title'],
                 'title': self._translationDictionary[pred_id]}]
schema = parse_table_schema_from_json(json.dumps({'fields': [
    {'name': 'index', 'type': 'INTEGER'},
    {'name': 'product_title', 'type': 'STRING'},
    {'name': 'title', 'type': 'STRING'}]}))
parser = argparse.ArgumentParser()
known_args, pipeline_args = parser.parse_known_args(None)
pipeline_options = PipelineOptions(pipeline_args)
# define the pipeline steps
p = beam.Pipeline(options=pipeline_options)
data = p | 'Read from BigQuery' >> beam.io.Read(
    beam.io.BigQuerySource(query=query, use_standard_sql=True))
scored = data | 'Apply Model' >> beam.ParDo(ApplyDoFn())
scored | 'Save to BigQuery' >> beam.io.Write(beam.io.gcp.bigquery.WriteToBigQuery(
    'my_table', 'my_dataset', 'my_project', schema=schema,
    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
# run the pipeline
result = p.run()
result.wait_until_finish()
My requirements.txt is:
google-cloud-storage==1.19.0
scikit-learn==0.23.1
Is there any way to solve this?
Answer 0 (score: 1)
As @Enrique Zetina pointed out, a solution exists and is described in detail here.
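In essence, Dataflow workers install only what requirements.txt names, so the file has to cover every non-standard package the pipeline imports, not just the two listed in the question. A sketch of what that might look like (pandas and joblib added; the pinned versions are illustrative assumptions):

google-cloud-storage==1.19.0
scikit-learn==0.23.1
pandas==1.0.5
joblib==0.15.1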
However, this raised another error about package names on the Dataflow workers. To resolve it, import the packages inside the function:
def run():
    import apache_beam as beam
    import argparse
    from google.cloud import storage
    from apache_beam.options.pipeline_options import PipelineOptions
    from apache_beam.options.pipeline_options import SetupOptions
    from apache_beam.io.gcp.bigquery import parse_table_schema_from_json
    import pandas as pd
    import pickle as pkl
    import joblib
    import json
    ...

if __name__ == '__main__':
    run()
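For completeness, here is a sketch of how the rest of the original script might move inside run(). It assumes query, schema and ApplyDoFn stay available (e.g. defined at module level with their own function-local imports where needed) and only re-scopes the pipeline construction:

def run():
    import apache_beam as beam
    import argparse
    from apache_beam.options.pipeline_options import PipelineOptions

    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(None)
    pipeline_options = PipelineOptions(pipeline_args)

    # define the pipeline steps, exactly as in the original script
    p = beam.Pipeline(options=pipeline_options)
    data = p | 'Read from BigQuery' >> beam.io.Read(
        beam.io.BigQuerySource(query=query, use_standard_sql=True))
    scored = data | 'Apply Model' >> beam.ParDo(ApplyDoFn())
    scored | 'Save to BigQuery' >> beam.io.Write(
        beam.io.gcp.bigquery.WriteToBigQuery(
            'my_table', 'my_dataset', 'my_project', schema=schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    # run the pipeline and block until it completes
    result = p.run()
    result.wait_until_finish()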
If your dataset is large, you'll want to use the parameter max_num_workers instead of num_workers.
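For example, the original invocation would become the following (same flags, with the worker count now an autoscaling cap rather than a fixed size):

python predictions.py \
  --runner DataflowRunner \
  --project my_project \
  --requirements_file "requirements.txt" \
  --temp_location gs://my_bucket/template/ \
  --worker_machine_type n1-standard-8 \
  --max_num_workers 5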