PySpark和Stata:变量的观测值丢失

时间:2018-01-30 17:30:51

标签: pyspark stata

我遇到了一个奇怪的错误。我在本地独立(standalone)模式下使用PySpark,并在每个worker中调用Stata。每个worker会把一个dofile写入磁盘,然后用subprocess.call调用Stata,打开一个运行该dofile的Stata实例;每个Stata实例随后都会读取一个csv。 问题是,csv中每个变量有630个观测值,但Stata只识别出621个。当我从常规的Stata实例导入该csv时,不会出现这种情况。 所有dofile的名称各不相同,csv只会被worker读取,从不会被写入。基本上我看到的是,当我运行python脚本时,会打开多个Stata窗口(如果我启用了gui)并开始工作。 到目前为止我尝试过的:

  1. 从cmd.exe运行dofile (630 obs)
  2. 从.py运行dofile而不使用Spark (630 obs)
  3. 从PySpark运行Stata (621 obs)
  4. 从Spark worker生成的Stata窗口重新运行dofile (621 obs)
  5. 在关闭PySpark启动的其中一个Stata实例后,从Spark worker生成的Stata窗口重新运行dofile (630 obs)

因此,Stata只有在从PySpark调用时才会出现这个bug;而当我手动关闭PySpark在某个worker中生成的一个Stata窗口后,bug就消失了。

    这是我用来运行Stata的命令,无论是来自PySpark还是来自cmd。

    'C:\\PROGRA~2\\Stata13\\StataMP-64.exe -e  do c:/repos/myrepo/python\\script_n.do'
    

    我只更改gui标志,编号n则取决于具体的worker。 这是pyspark脚本中唯一用到Spark的部分:

    # Read every line of the CSV located at <working_dir>/<csv_filename>.
    # The `with` statement closes the file handle when this block exits.
    # NOTE(review): the accepted answer below attributes the missing rows to the
    # file still being open when flatMap runs; as written here the file is
    # already closed at that point — confirm the real script matches this snippet.
    with open(stata.working_dir + '/' + stata.csv_filename) as f:
        db = f.readlines()
    # str() of a list yields the list's textual representation (brackets,
    # quotes and escaped newlines), not the raw CSV text.
    db = str(db)
    # Broadcast the string through the SparkContext `sc` so workers read it
    # via db_bc.value.
    db_bc = sc.broadcast(db)
    # For each element x of vars_ranges_rdd, call stata.call_stata with the
    # broadcast string and x, flattening the returned iterables into one RDD.
    r_squared_rdd = vars_ranges_rdd.flatMap(lambda x: stata.call_stata(db_bc.value,x))
    

    这些是来自csv文件的最后20个条目:

    193,81,.0192107,.390009,.426966,.573684,4.64,.000926,.7859043,17.44954,1,0,0,5.93109,222.9224,329.5925,206.0276,246.3821,208.0373,210.31,375.44,244.42,221.05,.1525252,.0860937,0,0,0,0,0,0,-3.952288,-.9415855,-.8510509,-.5556766,1.534714,-6.984615,-.2409203,2.859313,5.406824,5.797857,5.32801,5.506884,5.337718,5.348583,5.928099,5.498888,5.398389,-1.880425,-2.452319,1.780208,,,,,,,,
    193,82,.0240456,.330775,.468354,.563063,6.65,.0009732,.7912234,19.84998,1,0,0,5.93109,222.8613,353.4454,16.87376,271.7391,203.5503,222.12,385.04,265.34,225.8,.1519293,.0849039,1,0,0,0,0,0,-3.727803,-1.106317,-.7585309,-.5743637,1.894617,-6.93488,-.2341749,2.988203,5.406549,5.867729,2.82576,5.604843,5.315913,5.403218,5.953347,5.581012,5.41965,-1.88434,-2.466235,1.780208,.2244847,-.1647314,.0925199,-.0186871,.3599025,.0497351,.1288893,-.0039152
    193,83,.0220034,.350189,.331897,.448052,7.94,.0009964,.7965425,20.1008,1,0,0,5.93109,232.6596,361.6682,226.0903,293.4129,230.9871,242.12,399.31,265.12,237.99,.0959471,.0836757,0,1,0,0,0,0,-3.816558,-1.049282,-1.102931,-.802846,2.071913,-6.911382,-.2274748,3.00076,5.449576,5.890727,5.420935,5.681581,5.442362,5.489433,5.989738,5.580183,5.472229,-2.343959,-2.480807,1.780208,-.0887551,.0570346,-.3443998,-.2284823,.1772964,.0234981,.0125568,-.4596183
    193,84,.0171735,.342967,.640449,.442982,7.82,.0009761,.8045213,22.50491,1,0,0,5.93109,245.1605,385617,237.3293,324.3165,259.6269,254.5,420.93,268.27,254.88,.0559512,.0824713,0,0,1,0,0,0,-4.064388,-1.070121,-.4455858,-.8142262,2.056684,-6.931904,-.2175079,3.113734,5.501913,5.954844,5.469449,5.78172,5.559246,5.539301,6.042467,5.591994,5.540793,-2.883276,-2.495305,1.780208,-.2478294,-.0208387,.6573448,-.0113801,-.0152287,-.0205226,.1129739,-.5393174
    193,85,.0180945,.356682,.576227,.38565,7.93,.0010006,.8085107,24.77798,1,0,0,5.93109,257.2078,398.3154,256.9071,361.1236,279.1209,263.98,436.96,287.89,291.84,.0742574,.0812092,0,0,0,1,0,0,-4.012147,-1.030911,-.5512536,-.952825,2.070653,-6.907135,-.2125614,3.209955,5.549884,5.987244,5.548715,5.88922,5.631645,5.575873,6.079842,5.662579,5.676206,-2.600217,-2.510727,1.780208,.0522404,.0392104,-.1056677,-.1385989,.0139685,.0247688,.0962219,.2830586
    193,86,.0194505,.341006,.485,.407216,5.98,.0012105,.8071808,25.86566,1,0,0,5.93109,270.2549,443.5995,261.3926,303.6437,280.3852,278.78,453.32,329.18,322.33,.0712329,.0796638,0,0,0,0,1,0,-3.939883,-1.075855,-.7236063,-.8984115,1.788421,-6.716747,-.2142076,3.252916,5.599366,6.094922,5.566024,5.715855,5.636165,5.630423,6.116598,5.796605,5.775576,-2.641801,-2.52994,1.780208,.0722649,-.0449445,-.1723528,.0544135,-.2822324,.1903887,.0429606,-.0415835
    193,87,.0235277,.266055,.588859,.423423,5.86,.0011789,.8138298,28.51783,1,0,0,5.93109,285.8289,480.1948,268.3836,365.0196,295.9352,295.63,468.26,337.88,348.74,.1105016,.0781939,0,0,0,0,0,1,-3.749577,-1.324052,-.5295685,-.8593836,1.76815,-6.743199,-.2060041,3.350529,5.655394,6.174192,5.592417,5.899951,5.69014,5.689109,6.149024,5.822691,5.854327,-2.202725,-2.548563,1.780208,.1903057,-.2481971,.1940379,.0390279,-.0202709,-.0264521,.0976133,.4390755
    195,81,.0631212,.223671,.29148,.45,10.11,.0017569,1.695187,24.62795,0,0,0,37.4311,222.2222,289.8593,192.8789,293.2288,243.7925,290.6,388.05,237.91,233.3,.1227477,.0894729,0,0,0,0,0,0,-2.762699,-1.497579,-1.232784,-.7985078,2.313525,-6.34421,.5277932,3.203882,5.403678,5.669396,5.262063,5.680953,5.496317,5.671948,5.961134,5.471892,5.452325,-2.097624,-2.413819,3.622502,,,,,,,,
    195,82,.0588861,.276907,.242802,.494071,12.57,.001784,1.705882,26.92453,0,0,0,37.4311,240.2337,299.8858,201.3809,299.9741,252.5526,302.15,400.63,255.39,255.62,.1209773,.0879057,1,0,0,0,0,0,-2.83215,-1.284074,-1.415509,-.705076,2.531313,-6.328925,.5340825,3.293038,5.481612,5.703402,5.305198,5.703696,5.53162,5.710924,5.993038,5.542792,5.543692,-2.112152,-2.43149,3.622502,-.0694516,.2135054,-.1827251,.0934317,.217788,.015285,.0891559,-.014528
    195,83,.0573422,.231103,.310668,.449057,9.86,.0017866,1.719251,27.52601,0,0,0,37.4311,254.2064,328992,203.3134,332.2513,262.7839,328.69,419.8,258.78,269.11,.1181818,.0863571,0,1,0,0,0,0,-2.858718,-1.464892,-1.169031,-.8006054,2.288486,-6.327441,.541889,3.315131,5.538146,5.796033,5.314748,5.805892,5.571332,5.795115,6.039778,5.555978,5.59512,-2.135531,-2.449264,3.622502,-.0265682,-.1808182,.2464784,-.0955294,-.2428267,.0014844,.0220933,-.0233791
    195,84,.0551157,.156901,.337522,.494681,11.19,.0019252,1.727273,28.82286,0,0,0,37.4311,264.6621,341.1911,212.6847,348.2738,269.8379,354,444.77,272.71,288.35,.111459,.0848282,0,0,1,0,0,0,-2.898321,-1.85214,-1.086125,-.7038422,2.41502,-6.252741,.5465437,3.361169,5.578454,5.832443,5.359811,5.852989,5.597821,5.869297,6.097557,5.608409,5.664175,-2.194098,-2.467127,3.622502,-.0396023,-.3872484,.082906,.0967633,.1265342,.0746999,.0460374,-.0585675
    195,85,.0236432,.24697,1.56442,.478431,12.49,.0042988,1.721925,36.02775,0,0,0,37.4311,279.4814,340.6775,216.4476,365.8724,278.1635,362.04,459.44,283.4,309.1,.2716763,.0831767,0,0,0,1,0,0,-3.74468,-1.398488,.4475151,-.7372433,2.524928,-5.449429,.5434429,3.584289,5.632936,5.830936,5.377348,5.902285,5.628209,5.891755,6.130008,5.646859,5.733665,-1.303144,-2.486788,3.622502,-.846359,.4536518,1.53364,-.0334011,.1099079,.8033123,.2231207,.8909545
    195,86,.030095,.208738,1.44186,.475806,9.21,.0043097,1.727273,38.39109,0,0,0,37.4311,297.3497,373.8318,224.5291,387.5402,292782,381.86,458.69,307.84,306.63,.1890332,.0813075,0,0,0,0,1,0,-3.503396,-1.566675,.3659339,-.7427451,2.22029,-5.446882,.5465437,3.647825,5.694909,5.923806,5.414005,5.95982,5.679429,5.945054,6.128375,5.72958,5.725642,-1.665833,-2.509517,3.622502,.2412834,-.168187,-.0815812,-.0055018,-.3046384,.0025463,.0635362,-.3626887
    195,87,.0313973,.201397,1.67052,.470588,13.02,.0044592,1.745989,53.66693,0,0,0,37.4311,315.1641,377.9356,246.0614,411433,296.8684,392.27,480.79,303.11,337.28,.1561238,.0794507,0,0,0,0,0,1,-3.461033,-1.602477,.5131349,-.7537723,2.566487,-5.412779,.5573214,3.982797,5.753093,5.934724,5.505581,6.019646,5.693289,5.971951,6.175431,5.714096,5.820913,-1.857106,-2.532619,3.622502,.0423629,-.0358018,.147201,-.0110272,.3461967,.0341029,.3349714,-.1912732
    197,81,.0178621,.158915,1.06098,.356322,11.35,.0008308,.8571429,16.64038,1,0,0,5.46081,183.1502,291.3753,151.3533,228.9377,156.9114,230.08,317.48,260.86,209.1,.0617284,.0806272,0,0,0,0,0,0,-4.025074,-1.839386,.059193,-1.03192,2.429218,-7.093133,-.1541507,2.811832,5.210307,5.674612,5.019617,5.43345,5.055681,5.438427,5.760415,5.563984,5.342813,-2.785011,-2.517919,1.697597,,,,,,,,
    197,82,.0180711,.229167,.545455,.363636,6.64,.0009241,.8660714,17.61957,1,0,0,5.46081,194.2502,323.6862,151.1742,243.4275,171.5078,238.94,330.27,278.94,225.22,.1115789,.0798224,1,0,0,0,0,0,-4.013441,-1.473304,-.606135,-1.011602,1.893112,-6.986701,-.1437879,2.86901,5.269147,5.779775,5.018433,5.494819,5.144629,5.476213,5.799911,5.630997,5.417078,-2.193023,-2.527951,1.697597,.0116329,.3660816,-.665328,.0203185,-.5361059,.1064324,.0571783,.5919883
    197,83,.0155747,.226667,.480392,.428571,7.77,.0010729,.8690476,18.90585,1,0,0,5.46081,207.1006,317.9891,154321,254.8656,196.4637,256.19,352.65,345.27,235.9,.1138614,.0790195,0,1,0,0,0,0,-4.162107,-1.484273,-.7331528,-.8472989,2.05027,-6.837371,-.1403573,2.939471,5.333205,5.762017,5.039035,5.540736,5.280478,5.545919,5.865476,5.844326,5.463408,-2.172773,-2.53806,1.697597,-.1486664,-.010969,-.1270178,.164303,.1571581,.1493297,.0704613,.0202496
    197,84,.0136619,.204188,1.41026,.372727,10.11,.0011087,.8720238,22.70475,1,0,0,5.46081,230.2275,304.8781,170.5955,262.2378,192.6782,268.59,345.9,354.21,246.89,.1169591,.0782327,0,0,1,0,0,0,-4.293144,-1.588714,.3437741,-.986909,2.313525,-6.804576,-.1369385,3.122574,5.439068,5.719912,5.139296,5.569252,5.261022,5.593186,5.84615,5.86989,5.508943,-2.145931,-2.548068,1.697597,-.1310368,-.1044408,1.076927,-.1396101,.2632549,.0327954,.1831028,.0268421
    197,85,.0130857,.180556,.830769,.333333,5.96,.0010541,.875,24.12361,1,0,0,5.46081,253.0364,283.4008,171.6738,271.7391,207.2574,279.17,357.84,354.78,275.01,.0810811,.0772219,0,0,0,1,0,0,-4.336235,-1.711714,-.1854035,-1.098613,1.785071,-6.855049,-.1335314,3.183191,5.533534,5.646862,5.145596,5.604843,5.333961,5.631821,5.880086,5.871498,5.616807,-2.512306,-2.561072,1.697597,-.0430908,-.1230001,-.5291775,-.1117043,-.5284544,-.0504732,.0606167,-.3663745
    197,86,.012874,.112676,2.25,.244444,7.68,.0010879,.8809524,24.98198,1,0,0,5.46081,280555,324.3744,180.0927,312.2946,215.2698,306.09,376.54,355.64,294.49,.0757576,.0757007,0,0,0,0,1,0,-4.352546,-2.183239,.8109302,-1.408769,2.03862,-6.823469,-.1267517,3.218155,5.63677,5.781898,5.193472,5.743947,5.371892,5.723879,5.931024,5.873919,5.685245,-2.580217,-2.580968,1.697597,-.0163107,-.4715245,.9963337,-.3101556,.253549,.03158,.0349636,-.0679111
    197,87,.0141928,.207595,1.18293,.360825,12.23,.0011857,.889881,25.95258,1,0,0,5.46081,314166,341.8803,182802,348.1432,212.8205,322.92,391.72,385.65,306.85,.0675676,.0741989,0,0,0,0,0,1,-4.255021,-1.572166,.1679944,-1.019362,2.503892,-6.737397,-.1166676,3.256271,5.749922,5.834461,5.208404,5.852614,5.360449,5.777405,5.970547,5.95493,5.726359,-2.694627,-2.601006,1.697597,.0975251,.6110725,-.6429358,.3894068,.4652724,.0860724,.0381165,-.1144102
    

    所有dofiles都具有以下结构:

    cd c:/repos/myrepo/python
    import delimited using data.csv
    
     log using 1_1366, text replace 
    
     qui regress county   year , 
    di %23.18f e(r2)
    
     qui regress county   crmrte , 
    di %23.18f e(r2)
    
     qui regress county   year crmrte , 
    di %23.18f e(r2)
    
    ... more regressions ...
    
    
    
    log close
    
    clear
    
    exit
    

    有什么可能导致这种情况的想法吗?

1 个答案:

答案 0 :(得分:1)

答案很简单。由于文件在flatMap之前没有关闭,操作系统没有把文件最后一部分的访问权交给Stata,因此Stata无法读取这部分内容,导致每个变量末尾的若干观测值缺失。在启动Stata线程(即运行flatMap)之前正确关闭文件,即可解决此问题。