When I run the code below, I get an error at the line "text = words.reduce(lambda x,y: x+' '+y)".
But when I run the tokenize() logic on its own, it works fine (I show a small example of that further down, after the driver code).
Sorry, this is my first time posting a question; I have tried my best to format it, so please feel free to edit it for better readability.
from __future__ import division, unicode_literals

import os, fnmatch, sys, pickle
import math
from operator import add
from string import punctuation

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob as tb
from pyspark.sql.functions import *
class pre_processing:
    def __init__(self, name, extension):
        self.name = name
        self.extension = extension
        self.filename = name + "." + extension

    def word_tokenize(self, textFile):
        # stop_words and add_your_own_stop_words are assumed to be defined at module level (not shown here)
        x = nltk.word_tokenize(textFile)
        y = [i for i in x if i not in stop_words and not i.isdigit()]
        z = [i for i in y if i not in add_your_own_stop_words]
        return z

    def only_sent(self, text):
        blob = tb(text)
        sentences = blob.sentences
        only_sentences = '\n'.join(str(v) for v in sentences)
        return only_sentences

    def tokenize(self):
        # spark is assumed to be an existing SparkSession
        text = spark.sparkContext.textFile(self.filename)
        only_sentences = text.map(self.only_sent)
        words = text.flatMap(self.word_tokenize)
        #text = ''.join(words)
        text = words.reduce(lambda x, y: x + ' ' + y)
        blob = tb(text)
        tags = blob.tags
        noun_verb = [word for (word, pos) in tags
                     if pos in ('NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')]
        input_tfidf = ' '.join(noun_verb)
        noun_phrases = blob.noun_phrases
        only_noun_phrases = ' '.join(noun_phrases)
        return only_sentences.collect(), only_noun_phrases, input_tfidf

    def create(self):
        write_log("creating 3 new files")
        try:
            name_output1 = self.name + "_sentences" + "." + self.extension
            name_output2 = self.name + "_noun_phrases" + "." + self.extension
            name_output3 = self.name + "_input_tfidf" + "." + self.extension
            if not os.path.exists('pre_processing'):
                os.makedirs('pre_processing')
            script_path = os.path.abspath('__file__')   # i.e. /path/to/dir/foobar.py
            script_dir = os.path.split(script_path)[0]  # i.e. /path/to/dir/
            rel_path = "pre_processing/"
            abs_file_path = os.path.join(script_dir, rel_path)
            filepath1 = os.path.join(abs_file_path, name_output1)
            write_log("1. " + filepath1)
            filepath2 = os.path.join(abs_file_path, name_output2)
            write_log("2. " + filepath2)
            filepath3 = os.path.join(abs_file_path, name_output3)
            write_log("3. " + filepath3)
            write_log("Initiating Pre_processing")
            output1, output2, output3 = self.tokenize()
            write_log("pre_processing done")
            write_log("Storing output at %s" % (abs_file_path))
            #output1.saveAsTextFile(filepath1)
            file = open(filepath1, "a")
            file.write('\n'.join(output1))   # output1 is a list of sentences (from collect()), so join it before writing
            file.close()
            file = open(filepath2, "a")
            file.write(output2)
            file.close()
            file = open(filepath3, "a")
            file.write(output3)
            file.close()
            write_log("output stored successfully")
            write_log('\n')
        except:
            write_log("\n")
            write_log("error occurred while creating paths")
            write_log("\n")
            write_log("use %tb to see the full traceback.")
            log_file.close()
            sys.exit(0)
class Just_call:
    def main(self):
        name = input("enter the file names separated by commas:")
        extension = input("enter the extension:")
        L = name.split(',')
        newlist = [ii for n, ii in enumerate(L) if ii not in L[:n]]   # drop duplicate names, keep order
        no_doc = len(newlist)
        write_log('\n')
        write_log("number of documents you uploaded is %s" % (no_doc))
        for i in newlist:
            call = pre_processing(i, extension)
            call.create()
To call the above classes, I use the following code:

log_file = open("output_log.txt", "w")

def write_log(*args):
    line = ''.join([str(a) for a in args])
    log_file.write(line + '\n')
    print(line)

a = Just_call()
a.main()
log_file.close()
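For reference, when I run the tokenize logic on its own as plain functions (outside the class), it finishes without any error. Roughly like this (a simplified sketch of what I tested, not my exact code; "sample.txt" stands in for one of my input files, spark is the SparkSession I already have, and stop_words is defined at module level):

def word_tokenize_fn(line):
    # same tokenization as the word_tokenize method above
    tokens = nltk.word_tokenize(line)
    return [t for t in tokens if t not in stop_words and not t.isdigit()]

text = spark.sparkContext.textFile("sample.txt")
words = text.flatMap(word_tokenize_fn)
joined = words.reduce(lambda x, y: x + ' ' + y)   # this reduce works fine here
print(joined[:200])

Running the same steps through create() instead gives the error below.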
Output (error):
PicklingError Traceback (most recent call last)
<ipython-input-55-795f76bdd8b1> in create(self)
82
---> 83 output1,output2,output3 = self.tokenize()
84
<ipython-input-55-795f76bdd8b1> in tokenize(self)
42 #text = ''.join(words)
---> 43 text = words.collect().reduce(lambda x,y: x+' '+y)
44
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/rdd.py in collect(self)
823 with SCCallSiteSync(self.context) as css:
--> 824 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
825 return list(_load_from_socket(port, self._jrdd_deserializer))
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/rdd.py in _jrdd(self)
2469 wrapped_func = _wrap_function(self.ctx, self.func, self._prev_jrdd_deserializer,
-> 2470 self._jrdd_deserializer, profiler)
2471 python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(), wrapped_func,
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/rdd.py in _wrap_function(sc, func, deserializer, serializer, profiler)
2402 command = (func, profiler, deserializer, serializer)
-> 2403 pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
2404 return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes, sc.pythonExec,
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/rdd.py in _prepare_for_python_RDD(sc, command)
2388 ser = CloudPickleSerializer()
-> 2389 pickled_command = ser.dumps(command)
2390 if len(pickled_command) > (1 << 20): # 1M
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/serializers.py in dumps(self, obj)
567 def dumps(self, obj):
--> 568 return cloudpickle.dumps(obj, 2)
569
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in dumps(obj, protocol)
917 cp = CloudPickler(file,protocol)
--> 918 cp.dump(obj)
919
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in dump(self, obj)
234 try:
--> 235 return Pickler.dump(self, obj)
236 except RuntimeError as e:
/usr/lib/python3.5/pickle.py in dump(self, obj)
407 self.framer.start_framing()
--> 408 self.save(obj)
409 self.write(STOP)
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
474 if f is not None:
--> 475 f(self, obj) # Call unbound method with explicit self
476 return
/usr/lib/python3.5/pickle.py in save_tuple(self, obj)
739 for element in obj:
--> 740 save(element)
741
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
474 if f is not None:
--> 475 f(self, obj) # Call unbound method with explicit self
476 return
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_function(self, obj, name)
377 if klass is None or klass is not obj:
--> 378 self.save_function_tuple(obj)
379 return
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_function_tuple(self, func)
528 save(func.__module__)
--> 529 save(closure_values)
530 write(pickle.TUPLE)
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
474 if f is not None:
--> 475 f(self, obj) # Call unbound method with explicit self
476 return
/usr/lib/python3.5/pickle.py in save_list(self, obj)
769 self.memoize(obj)
--> 770 self._batch_appends(obj)
771
/usr/lib/python3.5/pickle.py in _batch_appends(self, items)
796 elif n:
--> 797 save(tmp[0])
798 write(APPEND)
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
474 if f is not None:
--> 475 f(self, obj) # Call unbound method with explicit self
476 return
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_instancemethod(self, obj)
651 if PY3:
--> 652 self.save_reduce(types.MethodType, (obj.__func__, obj.__self__), obj=obj)
653 else:
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_reduce(self, func, args, state, listitems, dictitems, obj)
785 save(func)
--> 786 save(args)
787 write(pickle.REDUCE)
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
474 if f is not None:
--> 475 f(self, obj) # Call unbound method with explicit self
476 return
/usr/lib/python3.5/pickle.py in save_tuple(self, obj)
724 for element in obj:
--> 725 save(element)
726 # Subtle. Same as in the big comment below.
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
519 # Save the reduce() output and finally memoize the object
--> 520 self.save_reduce(obj=obj, *rv)
521
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_reduce(self, func, args, state, listitems, dictitems, obj)
779 args = args[1:]
--> 780 save(cls)
781
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
474 if f is not None:
--> 475 f(self, obj) # Call unbound method with explicit self
476 return
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_global(self, obj, name, pack)
638 if typ is not obj and isinstance(obj, (type, types.ClassType)):
--> 639 self.save_dynamic_class(obj)
640 else:
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_dynamic_class(self, obj)
475 # encountered while saving will point to the skeleton class.
--> 476 save(clsdict)
477
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
474 if f is not None:
--> 475 f(self, obj) # Call unbound method with explicit self
476 return
/usr/lib/python3.5/pickle.py in save_dict(self, obj)
809 self.memoize(obj)
--> 810 self._batch_setitems(obj.items())
811
/usr/lib/python3.5/pickle.py in _batch_setitems(self, items)
835 save(k)
--> 836 save(v)
837 write(SETITEMS)
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
474 if f is not None:
--> 475 f(self, obj) # Call unbound method with explicit self
476 return
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_function(self, obj, name)
371 or themodule is None):
--> 372 self.save_function_tuple(obj)
373 return
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_function_tuple(self, func)
524 # save the rest of the func data needed by _fill_function
--> 525 save(f_globals)
526 save(defaults)
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
474 if f is not None:
--> 475 f(self, obj) # Call unbound method with explicit self
476 return
/usr/lib/python3.5/pickle.py in save_dict(self, obj)
809 self.memoize(obj)
--> 810 self._batch_setitems(obj.items())
811
/usr/lib/python3.5/pickle.py in _batch_setitems(self, items)
835 save(k)
--> 836 save(v)
837 write(SETITEMS)
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
474 if f is not None:
--> 475 f(self, obj) # Call unbound method with explicit self
476 return
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_file(self, obj)
834 if 'r' not in obj.mode and '+' not in obj.mode:
--> 835 raise pickle.PicklingError("Cannot pickle files that are not opened for reading: %s" % obj.mode)
836
PicklingError: Cannot pickle files that are not opened for reading: w
During handling of the above exception, another exception occurred:
SystemExit Traceback (most recent call last)
<ipython-input-55-795f76bdd8b1> in <module>()
132 print(line)
133 a = Just_call()
--> 134 a.main()
135 log_file.close()
<ipython-input-55-795f76bdd8b1> in main(self)
124 for i in newlist:
125 call = pre_processing(i,extension)
--> 126 call.create()
127
128 log_file = open("output_log.txt","w")
<ipython-input-55-795f76bdd8b1> in create(self)
111 write_log("use %tb to see the full traceback.")
112 log_file.close()
--> 113 sys.exit(0)
114
115 class Just_call:
SystemExit: 0
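Looking at the last frame, the PicklingError complains about a file opened with mode 'w'; the only such file in my code is log_file, which create() uses through write_log and log_file.close(). A tiny standalone sketch (hypothetical names, not my real code; spark is the existing SparkSession) seems to hit the same error on my setup, which makes me think Spark is trying to serialise log_file together with my class when I pass self.word_tokenize / self.only_sent to flatMap()/map():

log_file = open("output_log.txt", "w")

class Demo:
    def clean(self, s):
        return s.upper()
    def log_something(self):
        log_file.write("hello\n")   # this method refers to the global write-mode file

d = Demo()
rdd = spark.sparkContext.parallelize(["a", "b"])
rdd.map(d.clean).collect()   # raises the same PicklingError even though clean() never touches log_file

If that really is the cause, how should I organise the logging (or the class) so that passing these methods to map()/flatMap() does not pull log_file into what gets pickled?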