使用 pyspark 计算每行的字符数

时间:2016-08-03 01:17:44

标签: apache-spark pyspark

我可以计算整个文档中每个字符的总数。

我的文件:

ATATCCCCGGGAT
ATCGATCGATAT

计算文档中每个字符的总数:

# `add` is not a builtin: without this import, reduceByKey(add) fails with NameError.
from operator import add

# Load the file as an RDD (one element per line of text).
data=sc.textFile("data.txt")
# Emit a (character, 1) pair for every character of every line, then sum the
# ones per character to obtain the totals over the whole document.
counts=data.flatMap(lambda x:[(c,1) for c in x]).reduceByKey(add)

结果:

[(u'A', 7), (u'C', 6), (u'T', 7), (u'G', 5)]

我的实现(逐行统计):

counts=data.map(lambda x:[(c,1)for c in x])
for row in counts.collect():
    print sc.parallelize(row).reduceByKey(lambda x,y:x+y).collect()

有更好的方法吗?

2 个答案:

答案 0 :(得分:1)

尝试:

 // Register this object for the login button's callbacks.
        self.loginButton.delegate = self
        if FBSDKAccessToken.currentAccessToken() == nil {
            // No current access token: centre the button in the view, request the
            // permissions the app needs and (re)assign the delegate.
            loginButton.center = self.view.center
            loginButton.readPermissions = ["public_profile", "email", "user_friends"]
            loginButton.delegate = self
        } else {
            // User is already logged in, do work such as go to next view controller.
        }


    //MARK : facebook integration
    /// Handles the outcome of a login attempt made through the Facebook login button.
    ///
    /// - Parameters:
    ///   - loginButton: The button that triggered the login.
    ///   - result: The login result; printed first, then inspected only when `error` is nil.
    ///   - error: Non-nil when the login failed.
    func loginButton(loginButton: FBSDKLoginButton!, didCompleteWithResult result: FBSDKLoginManagerLoginResult!, error: NSError!) {
        print(result)

        // Failure: tell the user and stop.
        guard error == nil else {
            self.showCancelAlert("Error", message: "Error occured")
            return
        }

        // Cancelled by the user: nothing to do.
        if result.isCancelled {
            return
        }

        // Several permissions are requested at once, so verify that the one
        // this flow depends on was actually granted.
        guard result.grantedPermissions.contains("email") else {
            self.showCancelAlert("Error", message: "Cannot process with your email address")
            return
        }

        self.returnUserData()
    }

    /// Log-out hook that receives the login button.
    ///
    /// The body is empty, so invoking it currently has no effect. Besides being a
    /// callback, it is also called directly from `returnUserData()` on its failure paths.
    func loginButtonDidLogOut(loginButton: FBSDKLoginButton!) {

    }

    /// Builds the Graph API URL string of a Facebook user's large profile picture.
    ///
    /// - Parameter accessToken: Despite its name, this is the Facebook user id; it is
    ///   interpolated into the path of the URL.
    /// - Returns: The picture URL, or an empty string when no valid URL can be formed.
    func returnUserProfileImage(accessToken: NSString) -> String
    {
        // HTTPS rather than HTTP: plain-HTTP loads are rejected by App Transport
        // Security unless the app opts out explicitly.
        let facebookProfileUrl = NSURL(string: "https://graph.facebook.com/\(accessToken)/picture?type=large")

        // Optional chaining with a default replaces the nil check + force unwrap.
        return facebookProfileUrl?.absoluteString ?? ""
    }




    /// Fetches the logged-in user's profile from the Graph API and signs in with it.
    ///
    /// On success the user name is stored through `UserDefaults.sharedInstace` and a
    /// `SocialLogin` (id, name, picture URL, e-mail, provider "facebook") is passed to
    /// `WebClient.sharedInstace.socialLogin`. Every failure path shows an alert; when
    /// the e-mail is missing the Facebook session is logged out as well.
    func returnUserData()
    {
        // Since Graph API v2.4 only explicitly requested fields are returned. With
        // `parameters: nil` the response has no "email", so the login could never succeed.
        let graphRequest : FBSDKGraphRequest = FBSDKGraphRequest(graphPath: "me", parameters: ["fields": "id, name, email"])
        graphRequest.startWithCompletionHandler({ (connection, result, error) -> Void in

            if error != nil
            {
                // Process error
                print("error \(error)")
                return
            }

            // Conditional casts instead of `as!`: a missing or non-string field (or a
            // nil result) must not crash the app.
            guard let userName = result?.valueForKey("name") as? String else {
                print("name es null")
                self.showCancelAlert("Error", message: "Cannot read your Facebook profile")
                return
            }

            UserDefaults.sharedInstace.setUsername(userName)

            guard let userEmail = result?.valueForKey("email") as? String else {
                // No e-mail available: drop the Facebook session and inform the user.
                FBSDKLoginManager().logOut()

                self.loginButtonDidLogOut(self.loginButton)

                print("email es null")

                self.showCancelAlert("Error", message: "Cannot process with your email address")
                return
            }

            guard let id = result?.valueForKey("id") as? NSString else {
                // NOTE(review): unlike the e-mail path, this path does not call
                // FBSDKLoginManager().logOut() -- confirm whether that is intended.
                print("ID es null")

                self.showCancelAlert("Error", message: "Cannot process with your email address")

                self.loginButtonDidLogOut(self.loginButton)
                return
            }

            print("ID is: \(id)")

            self.showLoading("Signin...")
            WebClient.sharedInstace.socialLogin(SocialLogin(uid:String(id) , name: userName, imageUrl: self.returnUserProfileImage(id), email: userEmail, provider: "facebook"))
        })
    }

>>> counts.values().sum()
25

答案 1 :(得分:1)

如果你想要的只是"用 pyspark 计算每一行的字符数",而不是每行中每个字符的总数,下面的代码就可以做到:

# Pair the length of every line with that line's index: (length, index).
data.map(len).zipWithIndex().collect()
>>> [(13, 0), (12, 1)]

如果你想要字符数中的行索引:

def count_occur(str):
    """Count how often each distinct character occurs in ``str``.

    Args:
        str: The line to analyse. The parameter name is kept for backward
            compatibility even though it shadows the builtin.

    Returns:
        A plain ``dict`` mapping every distinct character to its number of
        occurrences; an empty dict for empty input.
    """
    # Imported locally so the function stays self-contained as a snippet.
    from collections import Counter

    # Counter tallies everything in one pass instead of re-scanning the input
    # once per distinct character; dict() keeps the original return type.
    return dict(Counter(str))

# Apply count_occur to every line and gather the per-line dictionaries on the driver.
data.map(count_occur).collect()
>>> [{'C': 4, 'T': 3, 'A': 3, 'G': 3}, {'C': 2, 'T': 4, 'A': 4, 'G': 2}]

现在,要计算每行的每个字符的数量,这可能会有所帮助:

zipWithIndex

同样,如果你想要行的索引,`zipWithIndex` 可以做到这一点:

data.map(lambda x: count_occur(x)).zipWithIndex().collect()
>>> [({'C': 4, 'T': 3, 'A': 3, 'G': 3}, 0), ({'C': 2, 'T': 4, 'A': 4, 'G': 2}, 1)]

# Export every Excel workbook found under $path (recursively) to a PDF placed in $path.
$path = "c:\fso"
# Resolve the fixed-format enum type by name so that xlTypePDF can be referenced below.
$xlFixedFormat = "Microsoft.Office.Interop.Excel.xlFixedFormatType" -as [type]
$excelFiles = Get-ChildItem -Path $path -Include *.xls, *.xlsx -Recurse

# Drive Excel through COM with its window hidden.
$excel = New-Object -ComObject excel.application
$excel.Visible = $false

foreach ($excelFile in $excelFiles)
{
  # The PDF reuses the workbook's base name and is written directly into $path.
  $filepath = Join-Path -Path $path -ChildPath ($excelFile.BaseName + ".pdf")
  # NOTE(review): the second argument (3) is presumably Open's UpdateLinks option -- confirm.
  $workbook = $excel.Workbooks.Open($excelFile.FullName, 3)
  # Mark the workbook as saved (presumably to avoid a save prompt on close -- confirm).
  $workbook.Saved = $true
  "saving $filepath"
  $workbook.ExportAsFixedFormat($xlFixedFormat::xlTypePDF, $filepath)
  $excel.Workbooks.Close()
}

$excel.Quit()

希望它有所帮助。