合并两个表,其中一列是R中另一列的子字符串

时间:2019-02-11 18:49:52

标签: r merge fuzzyjoin

我有两个data.frame,其列包含登录号

df 1的子集:

# For every user name, look for an address in email_ids whose local part
# (the text before "@") equals that name, and call send_email for each match.
for i in user_list:
    username = i
    for e in email_ids:
        em = e.split("@")[0]
        if username == em:
            # NOTE(review): this passes the local part `em`, while the text
            # below asks for the full address (`e`) -- confirm which one
            # send_email expects.
            send_email(i, em)

# I need to select the email based on the condition to send email only to the match i.e ravi.teja == ravi.teja send email to ravi.teja@gmail.com or teja.ravi == teja.ravi send email to teja.ravi@gmail.com
#
# can some one help me how to select an email_address if the firstname and the last name in the email id matches the exact string.

df 2的子集:

// Map view read by `postPressed` when building a `Job` and assigned to `map`
// in the location-update callback. Declared implicitly unwrapped; no
// assignment to it is visible here, so it may be nil when used -- TODO confirm
// where it is set.
var takenMap: MKMapView!

/// Builds a `Job` from the current form contents and saves it.
///
/// Does nothing unless `textView.text` and `userLocation.text` differ from the
/// empty string and both `takenImage` and `userLabel?.text` are non-nil.
///
/// - Parameter sender: The control that triggered the action (unused).
@IBAction func postPressed(_ sender: Any) {
    // Bind the optionals up front instead of force-unwrapping them inside the
    // initializer call, so a missing image or user name returns early rather
    // than crashing.
    guard textView.text != "", userLocation.text != "",
        let jobImage = takenImage,
        let userName = userLabel?.text else {
        return
    }

    // Create and save a new job
    // NOTE(review): `takenMap.MKMapView` is kept from the original, but a
    // member of that name cannot be confirmed from here -- decide what should
    // actually be persisted for `map` (e.g. an address string or coordinate).
    let newJob = Job(text: textView.text, jobImage: jobImage, addedByUser: userName, userImage: UserImage, location: userLocation.text, map: takenMap.MKMapView)
    newJob.save()
}

//MARK:- CLLocationManager Delegates
/// Centers `map` on the most recent location and reverse-geocodes that
/// location into a "street, city" string stored in `scanLocation`,
/// `takenLocation` and `userLocation.text`.
///
/// - Parameters:
///   - manager: The location manager reporting the update.
///   - locations: The reported locations; only the last element is used.
func locationManager(_ manager: CLLocationManager, didUpdateLocations locations: [CLLocation]) {
    guard let lastLocation = locations.last else { return }

    let geoCoder = CLGeocoder()

    // `lastLocation.coordinate` already is the CLLocationCoordinate2D needed as the center.
    let region = MKCoordinateRegion(center: lastLocation.coordinate, span: MKCoordinateSpan(latitudeDelta: 0.01, longitudeDelta: 0.01))
    map.setRegion(region, animated: true)
    // NOTE(review): this replaces `map` with `takenMap` right after `map` was
    // configured; kept as in the original -- confirm the intended direction.
    self.map = takenMap

    geoCoder.reverseGeocodeLocation(lastLocation) { [weak self] (placeMarks, error) in
        // The completion handler escapes; do not keep the owner alive for it.
        guard let self = self else { return }

        if let error = error {
            // Surface the failure instead of dropping it silently.
            print("Reverse geocoding failed:", error)
            return
        }

        // `first` yields nil for an empty result, whereas `placeMarks?[0]` would trap.
        guard let firstLocation = placeMarks?.first else { return }
        self.locationManager.stopUpdatingLocation()

        guard let cityName = firstLocation.locality,
            let street = firstLocation.thoroughfare else {
            return
        }

        let address = "\(street), \(cityName)"
        self.scanLocation = address
        print("This is the current city name", cityName)
        print("this is the current street address", street)
        self.takenLocation = address
        self.userLocation.text = self.takenLocation
    }
}

我想按列

var map: String? init(map: String? = nil) { self.map = map ref = Database.database().reference().child("jobs").childByAutoId() } init(snapshot: DataSnapshot){ ref = snapshot.ref if let value = snapshot.value as? [String : Any] { map = value["location"] as? String } } func save() { let newPostKey = ref.key // save jobImage if let imageData = jobImage?.jpegData(compressionQuality: 0.5) { let storage = Storage.storage().reference().child("jobImages/\(newPostKey)") storage.putData(imageData).observe(.success, handler: { (snapshot) in self.downloadURL = snapshot.metadata?.downloadURL()?.absoluteString let postDictionary = [ "map" : self.map! ] as [String : Any] self.ref.setValue(postDictionary) }) } }

合并两个数据帧,但问题是它们并不完全匹配。

sub_df1 <- structure(list(database = "CLO, ArrayExpress, ArrayExpress, ATCC, BCRJ, BioSample, CCLE, ChEMBL-Cells, ChEMBL-Targets, Cosmic, Cosmic, Cosmic, Cosmic-CLP, GDSC, GEO, GEO, GEO, IGRhCellID, LINCS_LDP, Wikidata", database_accession = "CLO_0009006, E-MTAB-2770, E-MTAB-3610, CRL-7724, 0337, SAMN03471142, SH4_SKIN, CHEMBL3308177, CHEMBL2366309, 687440, 909713, 2159447, 909713, 909713, GSM887568, GSM888651, GSM1670420, SH4, LCL-1280, Q54953204"), .Names = c("database", "database_accession"), row.names = 2L, class = "data.frame")

中的字符串是

sub_df2 <- structure(list(database_accession = "SH4_SKIN", G1 = -1.907138, G2 = -7.617305, G3 = -3.750553, G4 = 2.615004, G5 = 9.751557), .Names = c("database_accession", "G1", "G2", "G3", "G4", "G5"), row.names = 101L, class = "data.frame")

中字符串的子字符串。

我曾考虑过使用fuzzyjoin,但很难正确设置匹配算法。

2 个答案:

答案 0 :(得分:1)

您可以使用sqldf包编写一个查询,通过带有like条件的连接将两个表连接起来,以测试sub_df1中的值是否包含sub_df2中的值。

library(sqldf)
# Left-join sub_df2 to sub_df1: each sub_df2 row is paired with every sub_df1
# row whose database_accession contains the sub_df2 value.
# SQL string literals take single quotes; double quotes denote identifiers and
# are only accepted as strings through an SQLite compatibility fallback, so the
# R string is double-quoted and the SQL literals inside it are single-quoted.
# Note: LIKE treats "_" and "%" inside the sub_df2 value as wildcards and is
# case-insensitive for ASCII by default; use
#   instr(one.database_accession, two.database_accession) > 0
# if an exact, case-sensitive substring test is required.
sqldf("
select  *
from    sub_df2 two
        left join sub_df1 one
          on one.database_accession like '%' || two.database_accession || '%'
")

答案 1 :(得分:1)

使用match_fun = str_detect或regex_join()的fuzzyjoin解决方案:

library(tidyverse); library(fuzzyjoin)
# Load data
# Each data frame holds a single row. In sub_df1, `database` and
# `database_accession` are each one comma-separated string; in sub_df2,
# `database_accession` is the single ID "SH4_SKIN", followed by numeric
# columns G1..G5.
sub_df1 <- structure(list(database = "CLO, ArrayExpress, ArrayExpress, ATCC, BCRJ, BioSample, CCLE, ChEMBL-Cells, ChEMBL-Targets, Cosmic, Cosmic, Cosmic, Cosmic-CLP, GDSC, GEO, GEO, GEO, IGRhCellID, LINCS_LDP, Wikidata", database_accession = "CLO_0009006, E-MTAB-2770, E-MTAB-3610, CRL-7724, 0337, SAMN03471142, SH4_SKIN, CHEMBL3308177, CHEMBL2366309, 687440, 909713, 2159447, 909713, 909713, GSM887568, GSM888651, GSM1670420, SH4, LCL-1280, Q54953204"), .Names = c("database", "database_accession"), row.names = 2L, class = "data.frame")
sub_df2 <- structure(list(database_accession = "SH4_SKIN", G1 = -1.907138, G2 = -7.617305, G3 = -3.750553, G4 = 2.615004, G5 = 9.751557), .Names = c("database_accession", "G1", "G2", "G3", "G4", "G5"), row.names = 101L, class = "data.frame")

# Solution
# Full fuzzy join on database_accession: a sub_df1 row matches a sub_df2 row
# when the sub_df1 string contains the sub_df2 value.
# fixed() makes str_detect treat the sub_df2 value as a literal substring rather
# than a regular expression, so accessions containing regex metacharacters
# (".", "(", "+", ...) cannot produce false matches or pattern errors.
# (regex_full_join() is the regex-based wrapper for match_fun = str_detect, mode = "full".)
fuzzy_join(sub_df1, sub_df2,
           match_fun = function(x, y) str_detect(x, fixed(y)),
           by = "database_accession", mode = "full") %>%
  str()
#> 'data.frame':    1 obs. of  8 variables:
#>  $ database            : chr "CLO, ArrayExpress, ArrayExpress, ATCC, BCRJ, BioSample, CCLE, ChEMBL-Cells, ChEMBL-Targets, Cosmic, Cosmic, Cos"| __truncated__
#>  $ database_accession.x: chr "CLO_0009006, E-MTAB-2770, E-MTAB-3610, CRL-7724, 0337, SAMN03471142, SH4_SKIN, CHEMBL3308177, CHEMBL2366309, 68"| __truncated__
#>  $ database_accession.y: chr "SH4_SKIN"
#>  $ G1                  : num -1.91
#>  $ G2                  : num -7.62
#>  $ G3                  : num -3.75
#>  $ G4                  : num 2.62
#>  $ G5                  : num 9.75

由reprex package(v0.2.1)于2019-03-17创建