PYSPARK - Cannot pickle files that are not opened for reading: w

Time: 2018-06-06 13:15:48

Tags: apache-spark pyspark nltk

When I run the code below, I get an error at this line: text = words.reduce(lambda x, y: x + ' ' + y)

However, when I run the tokenize() function on its own, it works fine.
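From the traceback further down, the failure seems to come from cloudpickle refusing to serialize a file handle that is open in write mode ("Cannot pickle files that are not opened for reading: w"). For reference, here is a stripped-down sketch (my own reduction, not part of my project; it assumes an existing SparkSession named spark) that appears to raise the same PicklingError:

    log = open("some_log.txt", "w")   # a file opened for writing only

    def tag_line(x):
        # the function refers to the global write-mode handle, so cloudpickle
        # has to serialize it when the function is shipped to the executors
        log.write(str(x) + "\n")
        return x

    rdd = spark.sparkContext.parallelize(["a", "b", "c"])
    rdd.map(tag_line).collect()       # PicklingError: ... not opened for reading: w

In my real code the only write-mode handle I can see is log_file (used by write_log), so I suspect that is what gets dragged into the pickled closure, but I am not sure how.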

Sorry, this is my first time posting a question; I have done my best with the formatting, so please feel free to edit it for better readability. My full code follows:

    from __future__ import division, unicode_literals

    from pyspark.sql.functions import *
    from operator import add
    from nltk.corpus import stopwords
    from nltk import word_tokenize
    from string import punctuation
    from textblob import TextBlob as tb
    import nltk
    import math
    import os, fnmatch, sys, pickle

    class pre_processing:

        def __init__(self, name, extension):
            self.name = name
            self.extension = extension
            self.filename = name + "." + extension

        def word_tokenize(textFile):
            x = nltk.word_tokenize(textFile)
            y = [i for i in x if i not in stop_words and not i.isdigit()]
            z = [i for i in y if i not in add_your_own_stop_words]
            return z

        def only_sent(text):
            blob = tb(text)
            sentences = blob.sentences
            only_sentences = '\n'.join(str(v) for v in sentences)
            return only_sentences

        def tokenize(self):
            text = spark.sparkContext.textFile(self.filename)

            only_sentences = text.map(self.only_sent)

            words = text.flatMap(self.word_tokenize)
            #text = ''.join(words)
            text = words.reduce(lambda x, y: x + ' ' + y)

            blob = tb(text)
            tags = blob.tags
            noun_verb = [word for (word, pos) in tags
                         if pos in ('NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')]
            input_tfidf = ' '.join(noun_verb)

            noun_phrases = blob.noun_phrases
            only_noun_phrases = ' '.join(noun_phrases)

            return only_sentences.collect(), only_noun_phrases, input_tfidf

        def create(self):
            write_log("creating 3 new files")
            try:
                name_output1 = self.name + "_sentences" + "." + self.extension
                name_output2 = self.name + "_noun_phrases" + "." + self.extension
                name_output3 = self.name + "_input_tfidf" + "." + self.extension

                if not os.path.exists('pre_processing'):
                    os.makedirs('pre_processing')

                script_path = os.path.abspath('__file__')   # i.e. /path/to/dir/foobar.py
                script_dir = os.path.split(script_path)[0]  # i.e. /path/to/dir/
                rel_path = "pre_processing/"
                abs_file_path = os.path.join(script_dir, rel_path)

                filepath1 = os.path.join(abs_file_path, name_output1)
                write_log("1. " + filepath1)

                filepath2 = os.path.join(abs_file_path, name_output2)
                write_log("2. " + filepath2)

                filepath3 = os.path.join(abs_file_path, name_output3)
                write_log("3. " + filepath3)

                write_log("Initiating Pre_processing")

                output1, output2, output3 = self.tokenize()

                write_log("pre_processing done")
                write_log("Storing output at %s" % (abs_file_path))

                #output1.saveAsTextFile(filepath1)

                file = open(filepath1, "a")
                file.write(output1)
                file.close()

                file = open(filepath2, "a")
                file.write(output2)
                file.close()

                file = open(filepath3, "a")
                file.write(output3)
                file.close()

                write_log("output stored successfully")
                write_log('\n')

            except:
                write_log("\n")
                write_log("error occurred while creating paths")
                write_log("\n")
                write_log("use %tb to see the full traceback.")
                log_file.close()
                sys.exit(0)

    class Just_call:
        def main(self):
            name = input("enter the file names separated by commas:")
            extension = input("enter the extension:")
            L = name.split(',')
            newlist = [ii for n, ii in enumerate(L) if ii not in L[:n]]
            no_doc = len(newlist)
            write_log('\n')
            write_log("number of documents you uploaded is %s" % (no_doc))
            for i in newlist:
                call = pre_processing(i, extension)
                call.create()

To call the classes above, I use the following code:

    log_file = open("output_log.txt", "w")

    def write_log(*args):
        line = ''.join([str(a) for a in args])
        log_file.write(line + '\n')
        print(line)

    a = Just_call()
    a.main()
    log_file.close()

Output (error):
PicklingError                             Traceback (most recent call last)
<ipython-input-55-795f76bdd8b1> in create(self)
     82
---> 83             output1,output2,output3 = self.tokenize()
     84

<ipython-input-55-795f76bdd8b1> in tokenize(self)
     42         #text = ''.join(words)
---> 43         text = words.collect().reduce(lambda x,y: x+' '+y)
     44

~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/rdd.py in collect(self)
    823         with SCCallSiteSync(self.context) as css:
--> 824             port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
    825         return list(_load_from_socket(port, self._jrdd_deserializer))

~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/rdd.py in _jrdd(self)
   2469         wrapped_func = _wrap_function(self.ctx, self.func, self._prev_jrdd_deserializer,
-> 2470                                       self._jrdd_deserializer, profiler)
   2471         python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(), wrapped_func,

~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/rdd.py in _wrap_function(sc, func, deserializer, serializer, profiler)
   2402     command = (func, profiler, deserializer, serializer)
-> 2403     pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
   2404     return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes, sc.pythonExec,

~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/rdd.py in _prepare_for_python_RDD(sc, command)
   2388     ser = CloudPickleSerializer()
-> 2389     pickled_command = ser.dumps(command)
   2390     if len(pickled_command) > (1 << 20):  # 1M

~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/serializers.py in dumps(self, obj)
    567     def dumps(self, obj):
--> 568         return cloudpickle.dumps(obj, 2)
    569

~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in dumps(obj, protocol)
    917     cp = CloudPickler(file,protocol)
--> 918     cp.dump(obj)
    919

~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in dump(self, obj)
    234         try:
--> 235             return Pickler.dump(self, obj)
    236         except RuntimeError as e:

/usr/lib/python3.5/pickle.py in dump(self, obj)
    407             self.framer.start_framing()
--> 408         self.save(obj)
    409         self.write(STOP)

/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
    474         if f is not None:
--> 475             f(self, obj) # Call unbound method with explicit self
    476             return

/usr/lib/python3.5/pickle.py in save_tuple(self, obj)
    739         for element in obj:
--> 740             save(element)
    741

/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
    474         if f is not None:
--> 475             f(self, obj) # Call unbound method with explicit self
    476             return

~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_function(self, obj, name)
    377             if klass is None or klass is not obj:
--> 378                 self.save_function_tuple(obj)
    379                 return

~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_function_tuple(self, func)
    528         save(func.__module__)
--> 529         save(closure_values)
    530         write(pickle.TUPLE)

/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
    474         if f is not None:
--> 475             f(self, obj) # Call unbound method with explicit self
    476             return

/usr/lib/python3.5/pickle.py in save_list(self, obj)
    769         self.memoize(obj)
--> 770         self._batch_appends(obj)
    771

/usr/lib/python3.5/pickle.py in _batch_appends(self, items)
    796             elif n:
--> 797                 save(tmp[0])
    798                 write(APPEND)

/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
    474         if f is not None:
--> 475             f(self, obj) # Call unbound method with explicit self
    476             return

~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_instancemethod(self, obj)
    651             if PY3:
--> 652                 self.save_reduce(types.MethodType, (obj.__func__, obj.__self__), obj=obj)
    653             else:

~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_reduce(self, func, args, state, listitems, dictitems, obj)
    785             save(func)
--> 786             save(args)
    787             write(pickle.REDUCE)

/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
    474         if f is not None:
--> 475             f(self, obj) # Call unbound method with explicit self
    476             return

/usr/lib/python3.5/pickle.py in save_tuple(self, obj)
    724             for element in obj:
--> 725                 save(element)
    726             # Subtle.  Same as in the big comment below.

/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
    519         # Save the reduce() output and finally memoize the object
--> 520         self.save_reduce(obj=obj, *rv)
    521

~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_reduce(self, func, args, state, listitems, dictitems, obj)
    779             args = args[1:]
--> 780             save(cls)
    781

/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
    474         if f is not None:
--> 475             f(self, obj) # Call unbound method with explicit self
    476             return

~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_global(self, obj, name, pack)
    638         if typ is not obj and isinstance(obj, (type, types.ClassType)):
--> 639             self.save_dynamic_class(obj)
    640         else:

~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_dynamic_class(self, obj)
    475         # encountered while saving will point to the skeleton class.
--> 476         save(clsdict)
    477

/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
    474         if f is not None:
--> 475             f(self, obj) # Call unbound method with explicit self
    476             return

/usr/lib/python3.5/pickle.py in save_dict(self, obj)
    809         self.memoize(obj)
--> 810         self._batch_setitems(obj.items())
    811

/usr/lib/python3.5/pickle.py in _batch_setitems(self, items)
    835                     save(k)
--> 836                     save(v)
    837                 write(SETITEMS)

/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
    474         if f is not None:
--> 475             f(self, obj) # Call unbound method with explicit self
    476             return

~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_function(self, obj, name)
    371                 or themodule is None):
--> 372             self.save_function_tuple(obj)
    373             return

~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_function_tuple(self, func)
    524         # save the rest of the func data needed by _fill_function
--> 525         save(f_globals)
    526         save(defaults)

/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
    474         if f is not None:
--> 475             f(self, obj) # Call unbound method with explicit self
    476             return

/usr/lib/python3.5/pickle.py in save_dict(self, obj)
    809         self.memoize(obj)
--> 810         self._batch_setitems(obj.items())
    811

/usr/lib/python3.5/pickle.py in _batch_setitems(self, items)
    835                     save(k)
--> 836                     save(v)
    837                 write(SETITEMS)

/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
    474         if f is not None:
--> 475             f(self, obj) # Call unbound method with explicit self
    476             return

~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_file(self, obj)
    834         if 'r' not in obj.mode and '+' not in obj.mode:
--> 835             raise pickle.PicklingError("Cannot pickle files that are not opened for reading: %s" % obj.mode)
    836

PicklingError: Cannot pickle files that are not opened for reading: w

During handling of the above exception, another exception occurred:

SystemExit                                Traceback (most recent call last)
<ipython-input-55-795f76bdd8b1> in <module>()
    132         print(line)
    133 a = Just_call()
--> 134 a.main()
    135 log_file.close()

<ipython-input-55-795f76bdd8b1> in main(self)
    124         for i in newlist:
    125             call = pre_processing(i,extension)
--> 126             call.create()
    127
    128 log_file = open("output_log.txt","w")

<ipython-input-55-795f76bdd8b1> in create(self)
    111                 write_log("use %tb to see the full traceback.")
    112                 log_file.close()
--> 113                 sys.exit(0)
    114
    115 class Just_call:

SystemExit: 0
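
If I am reading the trace correctly, passing the bound methods self.only_sent and self.word_tokenize to map/flatMap forces cloudpickle to serialize the whole pre_processing class, and from there it reaches write_log and finally the global log_file, which is opened with "w" and therefore cannot be pickled. One workaround I am considering (an untested sketch with hypothetical helper names, stop-word filtering omitted; I have not confirmed this is the right fix) is to give the RDD transformations plain functions that reference nothing except imported modules:

    import nltk
    from textblob import TextBlob as tb

    def sentences_only(doc):
        # same idea as only_sent, but with no reference to the class or to log_file
        return '\n'.join(str(s) for s in tb(doc).sentences)

    def clean_tokens(doc):
        # same idea as word_tokenize, minus the stop-word lists
        return [t for t in nltk.word_tokenize(doc) if not t.isdigit()]

    text = spark.sparkContext.textFile("some_document.txt")    # placeholder path
    only_sentences = text.map(sentences_only)                  # plain functions, not self.<method>
    words = text.flatMap(clean_tokens)
    joined = words.reduce(lambda x, y: x + ' ' + y)

Is that the right way to think about this error, or is something else being pickled here?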

0 Answers:

No answers yet.