将PySpark数据框转换为Pandas数据框

时间:2018-09-10 12:31:19

标签: pandas dataframe pyspark

我遇到了以下错误:

  

将PySpark数据框转换为Pandas数据框时出错。

代码:

# Build a four-row DataFrame with columns "user_id" and "phone_number" from an
# in-memory list of tuples. `sc` is defined outside this snippet
# (presumably the active SparkContext -- confirm in the calling session).
some_df = sc.parallelize(
    [
        ("A", "no"),
        ("B", "yes"),
        ("B", "yes"),
        ("B", "no"),
    ]
).toDF(["user_id", "phone_number"])

# Convert the Spark DataFrame to a pandas DataFrame; this is the conversion
# step the question reports as failing with Py4JJavaError.
pandas_df = some_df.toPandas()

错误:Py4JJavaError:调用 o104.collectToPython 时发生错误(An error occurred while calling o104.collectToPython)。[原帖此处附有错误信息截图]

1 个答案:

答案 0 :(得分:0)

我在自己的系统上测试时,这段代码运行正常。当Spark需要把所有数据加载到驱动程序(driver)内存中时,就会出现此错误,因此可能是您的驱动程序内存不足。您可以增大驱动程序内存来解决问题,或者使用以下方法清除垃圾:

#----------
# import
#----------
import matplotlib.animation as animation
from matplotlib.figure import Figure
from matplotlib import style
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
from matplotlib.backends.backend_tkagg import NavigationToolbar2Tk
import tkinter as tk
from tkinter import ttk
import sqlite3
import time
import random

style.use('ggplot')
LARGE_FONT= ("Verdana", 14)

f = Figure(figsize=(10,5), dpi=100)
ax = f.add_subplot(111)

writer = sqlite3.connect('./tmp.db')
cw = writer.cursor()
cw.execute("PRAGMA journal_mode=WAL;")
reader = sqlite3.connect('./tmp.db')
cr = reader.cursor()
cr.execute("PRAGMA journal_mode=WAL;")

cw.execute("CREATE TABLE IF NOT EXISTS 'tmpTable' ('time' INTERER, 'val1' REAL, 'val2' REAL)")
cw.execute("DELETE FROM 'tmpTable'")

def write_to_db():
    for i in range(10):
        unix = int(time.time())
        val1 = '{:0.3f}'.format(random.random())
        val2 = '{:0.3f}'.format(random.random())
        cw.execute("INSERT INTO 'tmpTable' VALUES (?, ?, ?)", (unix, val1, val2))
        writer.commit()
        time.sleep(1)

def animate(i):
    cr.execute("SELECT * FROM 'tmpTable'")
    data = cr.fetchall()
    xList = []
    yList1 = []
    yList2 = []
    for line in data:
        xList.append(line[0])
        yList1.append(line[1])
        yList2.append(line[2])
    xList[:] = [x-xList[0] for x in xList]
    print(xList)
    print(yList1)
    print(yList2)
    ax.clear()
    ax.plot(xList, yList1)
    ax.plot(xList, yList2)

class MultiPage(tk.Tk):
    def __init__(self, *args,**kwargs):
        tk.Tk.__init__(self, *args, **kwargs)
        tk.Tk.wm_title(self, 'Multithreading Test')
        container = tk.Frame(self)
        container.pack(side='top', fill='both', expand=True)
        container.grid_rowconfigure(0, weight=1)
        container.grid_columnconfigure(0, weight=1)
        self.frames = {}
        for F in (StartPage, GraphPage):
            frame = F(container, self)
            self.frames[F] = frame
            frame.grid(row=0, column=0, sticky="nsew")
        self.show_frame(StartPage)

    def show_frame(self, cont):
        frame = self.frames[cont]
        frame.tkraise()

class StartPage(tk.Frame):
    def __init__(self, parent, controller):
        tk.Frame.__init__(self,parent)
        label = ttk.Label(self, text="Start Page", font=LARGE_FONT)
        label.pack(pady=10,padx=10)
        button1 = ttk.Button(self, text="Go to Graph Page", command=lambda: controller.show_frame(GraphPage))
        button1.pack()

class GraphPage(tk.Frame):
    def __init__(self, parent, controller):
        tk.Frame.__init__(self, parent)
        label = ttk.Label(self, text="Graph Page", font=LARGE_FONT)
        label.pack(pady=10,padx=10)
        button1 = ttk.Button(self, text="Back to Home", command=lambda: controller.show_frame(StartPage))
        button1.pack()
        canvas = FigureCanvasTkAgg(f, self)
        canvas.draw()
        canvas.get_tk_widget().pack(side=tk.BOTTOM, fill=tk.BOTH, expand=True)
        toolbar = NavigationToolbar2Tk(canvas, self)
        toolbar.update()
        canvas._tkcanvas.pack(side=tk.TOP, fill=tk.BOTH, expand=True)

#----------
# Main app
#----------
if __name__ == "__main__":
    app = MultiPage()
    write_to_db()
    ani = animation.FuncAnimation(f, animate, interval=1000)
    app.mainloop()
    cw.close()
    cr.close()
    writer.close()
    reader.close()

如果这对您有帮助,请告诉我。