When I run the code below, I get an error at the line "text = words.reduce(lambda x,y: x+' '+y)".
But when I run the tokenize() logic on its own, it works fine (I show a small example of that further down, after the driver code).
Sorry, this is my first time posting a question; I have tried my best to format it, so please feel free to edit it for better readability.
from __future__ import division, unicode_literals

import os, fnmatch, sys, pickle
import math
from operator import add
from string import punctuation

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob as tb
from pyspark.sql.functions import *
class pre_processing:
    def __init__(self, name, extension):
        self.name = name
        self.extension = extension
        self.filename = name + "." + extension

    def word_tokenize(self, textFile):
        # stop_words and add_your_own_stop_words are assumed to be defined at module level (not shown here)
        x = nltk.word_tokenize(textFile)
        y = [i for i in x if i not in stop_words and not i.isdigit()]
        z = [i for i in y if i not in add_your_own_stop_words]
        return z

    def only_sent(self, text):
        blob = tb(text)
        sentences = blob.sentences
        only_sentences = '\n'.join(str(v) for v in sentences)
        return only_sentences

    def tokenize(self):
        # spark is assumed to be an existing SparkSession
        text = spark.sparkContext.textFile(self.filename)
        only_sentences = text.map(self.only_sent)
        words = text.flatMap(self.word_tokenize)
        #text = ''.join(words)
        text = words.reduce(lambda x, y: x + ' ' + y)
        blob = tb(text)
        tags = blob.tags
        noun_verb = [word for (word, pos) in tags
                     if pos in ('NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')]
        input_tfidf = ' '.join(noun_verb)
        noun_phrases = blob.noun_phrases
        only_noun_phrases = ' '.join(noun_phrases)
        return only_sentences.collect(), only_noun_phrases, input_tfidf

    def create(self):
        write_log("creating 3 new files")
        try:
            name_output1 = self.name + "_sentences" + "." + self.extension
            name_output2 = self.name + "_noun_phrases" + "." + self.extension
            name_output3 = self.name + "_input_tfidf" + "." + self.extension
            if not os.path.exists('pre_processing'):
                os.makedirs('pre_processing')
            script_path = os.path.abspath('__file__')   # i.e. /path/to/dir/foobar.py
            script_dir = os.path.split(script_path)[0]  # i.e. /path/to/dir/
            rel_path = "pre_processing/"
            abs_file_path = os.path.join(script_dir, rel_path)
            filepath1 = os.path.join(abs_file_path, name_output1)
            write_log("1. " + filepath1)
            filepath2 = os.path.join(abs_file_path, name_output2)
            write_log("2. " + filepath2)
            filepath3 = os.path.join(abs_file_path, name_output3)
            write_log("3. " + filepath3)
            write_log("Initiating Pre_processing")
            output1, output2, output3 = self.tokenize()
            write_log("pre_processing done")
            write_log("Storing output at %s" % (abs_file_path))
            #output1.saveAsTextFile(filepath1)
            file = open(filepath1, "a")
            file.write('\n'.join(output1))   # output1 is a list of sentences (from collect()), so join it before writing
            file.close()
            file = open(filepath2, "a")
            file.write(output2)
            file.close()
            file = open(filepath3, "a")
            file.write(output3)
            file.close()
            write_log("output stored successfully")
            write_log('\n')
        except:
            write_log("\n")
            write_log("error occurred while creating paths")
            write_log("\n")
            write_log("use %tb to see the full traceback.")
            log_file.close()
            sys.exit(0)
class Just_call:
    def main(self):
        name = input("enter the file names separated by commas:")
        extension = input("enter the extension:")
        L = name.split(',')
        newlist = [ii for n, ii in enumerate(L) if ii not in L[:n]]   # drop duplicate names, keep order
        no_doc = len(newlist)
        write_log('\n')
        write_log("number of documents you uploaded is %s" % (no_doc))
        for i in newlist:
            call = pre_processing(i, extension)
            call.create()
To call the above classes, I use the following code:

log_file = open("output_log.txt", "w")

def write_log(*args):
    line = ''.join([str(a) for a in args])
    log_file.write(line + '\n')
    print(line)

a = Just_call()
a.main()
log_file.close()
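For reference, when I run the tokenize logic on its own as plain functions (outside the class), it finishes without any error. Roughly like this (a simplified sketch of what I tested, not my exact code; "sample.txt" stands in for one of my input files, spark is the SparkSession I already have, and stop_words is defined at module level):

def word_tokenize_fn(line):
    # same tokenization as the word_tokenize method above
    tokens = nltk.word_tokenize(line)
    return [t for t in tokens if t not in stop_words and not t.isdigit()]

text = spark.sparkContext.textFile("sample.txt")
words = text.flatMap(word_tokenize_fn)
joined = words.reduce(lambda x, y: x + ' ' + y)   # this reduce works fine here
print(joined[:200])

Running the same steps through create() instead gives the error below.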
Output (error):
PicklingError Traceback (most recent call last)
<ipython-input-55-795f76bdd8b1> in create(self)
82
---> 83 output1,output2,output3 = self.tokenize()
84
<ipython-input-55-795f76bdd8b1> in tokenize(self)
42 #text = ''.join(words)
---> 43 text = words.collect().reduce(lambda x,y: x+' '+y)
44
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/rdd.py in collect(self)
823 with SCCallSiteSync(self.context) as css:
--> 824 port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
825 return list(_load_from_socket(port, self._jrdd_deserializer))
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/rdd.py in _jrdd(self)
2469 wrapped_func = _wrap_function(self.ctx, self.func, self._prev_jrdd_deserializer,
-> 2470 self._jrdd_deserializer, profiler)
2471 python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(), wrapped_func,
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/rdd.py in _wrap_function(sc, func, deserializer, serializer, profiler)
2402 command = (func, profiler, deserializer, serializer)
-> 2403 pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
2404 return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes, sc.pythonExec,
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/rdd.py in _prepare_for_python_RDD(sc, command)
2388 ser = CloudPickleSerializer()
-> 2389 pickled_command = ser.dumps(command)
2390 if len(pickled_command) > (1 << 20): # 1M
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/serializers.py in dumps(self, obj)
567 def dumps(self, obj):
--> 568 return cloudpickle.dumps(obj, 2)
569
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in dumps(obj, protocol)
917 cp = CloudPickler(file,protocol)
--> 918 cp.dump(obj)
919
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in dump(self, obj)
234 try:
--> 235 return Pickler.dump(self, obj)
236 except RuntimeError as e:
/usr/lib/python3.5/pickle.py in dump(self, obj)
407 self.framer.start_framing()
--> 408 self.save(obj)
409 self.write(STOP)
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
474 if f is not None:
--> 475 f(self, obj) # Call unbound method with explicit self
476 return
/usr/lib/python3.5/pickle.py in save_tuple(self, obj)
739 for element in obj:
--> 740 save(element)
741
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
474 if f is not None:
--> 475 f(self, obj) # Call unbound method with explicit self
476 return
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_function(self, obj, name)
377 if klass is None or klass is not obj:
--> 378 self.save_function_tuple(obj)
379 return
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_function_tuple(self, func)
528 save(func.__module__)
--> 529 save(closure_values)
530 write(pickle.TUPLE)
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
474 if f is not None:
--> 475 f(self, obj) # Call unbound method with explicit self
476 return
/usr/lib/python3.5/pickle.py in save_list(self, obj)
769 self.memoize(obj)
--> 770 self._batch_appends(obj)
771
/usr/lib/python3.5/pickle.py in _batch_appends(self, items)
796 elif n:
--> 797 save(tmp[0])
798 write(APPEND)
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
474 if f is not None:
--> 475 f(self, obj) # Call unbound method with explicit self
476 return
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_instancemethod(self, obj)
651 if PY3:
--> 652 self.save_reduce(types.MethodType, (obj.__func__, obj.__self__), obj=obj)
653 else:
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_reduce(self, func, args, state, listitems, dictitems, obj)
785 save(func)
--> 786 save(args)
787 write(pickle.REDUCE)
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
474 if f is not None:
--> 475 f(self, obj) # Call unbound method with explicit self
476 return
/usr/lib/python3.5/pickle.py in save_tuple(self, obj)
724 for element in obj:
--> 725 save(element)
726 # Subtle. Same as in the big comment below.
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
519 # Save the reduce() output and finally memoize the object
--> 520 self.save_reduce(obj=obj, *rv)
521
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_reduce(self, func, args, state, listitems, dictitems, obj)
779 args = args[1:]
--> 780 save(cls)
781
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
474 if f is not None:
--> 475 f(self, obj) # Call unbound method with explicit self
476 return
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_global(self, obj, name, pack)
638 if typ is not obj and isinstance(obj, (type, types.ClassType)):
--> 639 self.save_dynamic_class(obj)
640 else:
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_dynamic_class(self, obj)
475 # encountered while saving will point to the skeleton class.
--> 476 save(clsdict)
477
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
474 if f is not None:
--> 475 f(self, obj) # Call unbound method with explicit self
476 return
/usr/lib/python3.5/pickle.py in save_dict(self, obj)
809 self.memoize(obj)
--> 810 self._batch_setitems(obj.items())
811
/usr/lib/python3.5/pickle.py in _batch_setitems(self, items)
835 save(k)
--> 836 save(v)
837 write(SETITEMS)
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
474 if f is not None:
--> 475 f(self, obj) # Call unbound method with explicit self
476 return
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_function(self, obj, name)
371 or themodule is None):
--> 372 self.save_function_tuple(obj)
373 return
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_function_tuple(self, func)
524 # save the rest of the func data needed by _fill_function
--> 525 save(f_globals)
526 save(defaults)
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
474 if f is not None:
--> 475 f(self, obj) # Call unbound method with explicit self
476 return
/usr/lib/python3.5/pickle.py in save_dict(self, obj)
809 self.memoize(obj)
--> 810 self._batch_setitems(obj.items())
811
/usr/lib/python3.5/pickle.py in _batch_setitems(self, items)
835 save(k)
--> 836 save(v)
837 write(SETITEMS)
/usr/lib/python3.5/pickle.py in save(self, obj, save_persistent_id)
474 if f is not None:
--> 475 f(self, obj) # Call unbound method with explicit self
476 return
~/Downloads/spark-2.3.0-bin-hadoop2.7/python/pyspark/cloudpickle.py in save_file(self, obj)
834 if 'r' not in obj.mode and '+' not in obj.mode:
--> 835 raise pickle.PicklingError("Cannot pickle files that are not opened for reading: %s" % obj.mode)
836
PicklingError: Cannot pickle files that are not opened for reading: w
During handling of the above exception, another exception occurred:
SystemExit Traceback (most recent call last)
<ipython-input-55-795f76bdd8b1> in <module>()
132 print(line)
133 a = Just_call()
--> 134 a.main()
135 log_file.close()
<ipython-input-55-795f76bdd8b1> in main(self)
124 for i in newlist:
125 call = pre_processing(i,extension)
--> 126 call.create()
127
128 log_file = open("output_log.txt","w")
<ipython-input-55-795f76bdd8b1> in create(self)
111 write_log("use %tb to see the full traceback.")
112 log_file.close()
--> 113 sys.exit(0)
114
115 class Just_call:
SystemExit: 0
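Looking at the last frame, the PicklingError complains about a file opened with mode 'w'; the only such file in my code is log_file, which create() uses through write_log and log_file.close(). A tiny standalone sketch (hypothetical names, not my real code; spark is the existing SparkSession) seems to hit the same error on my setup, which makes me think Spark is trying to serialise log_file together with my class when I pass self.word_tokenize / self.only_sent to flatMap()/map():

log_file = open("output_log.txt", "w")

class Demo:
    def clean(self, s):
        return s.upper()
    def log_something(self):
        log_file.write("hello\n")   # this method refers to the global write-mode file

d = Demo()
rdd = spark.sparkContext.parallelize(["a", "b"])
rdd.map(d.clean).collect()   # raises the same PicklingError even though clean() never touches log_file

If that really is the cause, how should I organise the logging (or the class) so that passing these methods to map()/flatMap() does not pull log_file into what gets pickled?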