
时间:2018-05-09 14:45:25

标签: r dataframe stata text-classification


id       drug        start        stop          dose    unit    route   
2010003  Amlodipine  2009-02-04   2009-11-19    1.5     mg      Oral    
2010003  Amlodipine  2009-11-19   2010-01-11    1.5     mg      Oral      
2010004  Cefprozil   2004-03-12   2004-03-19    175     mg      Oral    
2010004  Clobazam    2002-12-30   2003-01-01    5       mg      Oral

我有一个Stata do文件,它显示了我要做的事情:

// Assign a therapeutic class to each record by matching the upper-cased drug
// name against a glob pattern; the trailing * is a wildcard.
// NOTE(review): `replace` requires the variable `class` to exist already; its
// creation is not shown in this excerpt.
// The statements run top to bottom, so when more than one pattern matches the
// same drug name the last matching line decides the final class.
replace class = "ACE Inhibitor" if strmatch(upper(drug), "CAPTOPRIL*")
replace class = "ACE Inhibitor" if strmatch(upper(drug), "ENALAPRIL*")
// Every name matched here is already matched by ENALAPRIL* above (same class).
replace class = "ACE Inhibitor" if strmatch(upper(drug), "ENALAPRILAT*")
replace class = "ACE Inhibitor" if strmatch(upper(drug), "FOSINOPRIL*")
replace class = "ACE Inhibitor" if strmatch(upper(drug), "LISINOPRIL*")
replace class = "ACE Inhibitor" if strmatch(upper(drug), "RAMIPRIL*")
replace class = "Acne Medication" if strmatch(upper(drug), "ADAPALENE*")
replace class = "Acne Medication" if strmatch(upper(drug), "ADAPALENE/BENZOYL PEROXIDE*")
replace class = "Acne Medication" if strmatch(upper(drug), "BENZOYL PEROXIDE*")
replace class = "Acne Medication" if strmatch(upper(drug), "BENZOYL PEROXIDE/CLINDAMYCIN*")
replace class = "Acne Medication" if strmatch(upper(drug), "ISOTRETINOIN*")
replace class = "Acne Medication" if strmatch(upper(drug), "ERYTHROMYCIN/TRETINOIN*")
replace class = "Acne Medication/Acute Promyelocytic Leukemia Medication" if strmatch(upper(drug), "TRETINOIN*")
replace class = "Alpha Agonist" if strmatch(upper(drug), "XYLOMETAZOLINE*")
replace class = "Alpha Blocker" if strmatch(upper(drug), "DOXAZOSIN*")
replace class = "Alpha Blocker" if strmatch(upper(drug), "PHENOXYBENZAMINE*")
replace class = "Alpha Blocker" if strmatch(upper(drug), "PHENTOLAMINE*")
replace class = "Alpha Blocker" if strmatch(upper(drug), "PRAZOSIN*")
replace class = "Alpha Blocker" if strmatch(upper(drug), "TAMSULOSIN*")
replace class = "Alpha Blocker" if strmatch(upper(drug), "TERAZOSIN*")
replace class = "Alpha/Beta Blocker" if strmatch(upper(drug), "CARVEDILOL*")
replace class = "Alpha/Beta Blocker" if strmatch(upper(drug), "LABETALOL*")
replace class = "Alpha-1 Agonist" if strmatch(upper(drug), "PHENYLEPHRINE*")
replace class = "Alpha-1 Agonist" if strmatch(upper(drug), "MIDODRINE*")
replace class = "Alpha-2 Agonist" if strmatch(upper(drug), "CLONIDINE*")
replace class = "Alpha-2 Agonist" if strmatch(upper(drug), "DEXMEDETOMIDINE*")
replace class = "Anaesthetic, general" if strmatch(upper(drug), "KETAMINE*")
replace class = "Anaesthetic, general" if strmatch(upper(drug), "THIOPENTAL*")
replace class = "Anaesthetic, local" if strmatch(upper(drug), "BENZOCAINE*")
replace class = "Anaesthetic, local" if strmatch(upper(drug), "BUPIVACAINE*")
replace class = "Anaesthetic, local" if strmatch(upper(drug), "BUPIVACAINE/FENTANYL*")
replace class = "Anaesthetic, local" if strmatch(upper(drug), "TETRACAINE*")
replace class = "Anaesthetic, local" if strmatch(upper(drug), "XYLOCAINE*")
replace class = "Anaesthetic, local/Antiarrythmic" if strmatch(upper(drug), "LIDOCAINE*")
replace class = "Anaesthetic, local/Antiseptic" if strmatch(upper(drug), "HEXYLRESORCINOL*")
// Runs after the broader LIDOCAINE* rule above, so the lidocaine/prilocaine
// combination ends up in this class instead.
replace class = "Anaesthetic, topical" if strmatch(upper(drug), "LIDOCAINE/PRILOCAINE*")
replace class = "Anaesthetic, topical" if strmatch(upper(drug), "PROPARACAINE*")
replace class = "Analgesic" if strmatch(upper(drug), "ACETAMINOPHEN*")
replace class = "Analgesic" if strmatch(upper(drug), "BELLADONNA & OPIUM SUPPOSITORY*")




2 个答案:

答案 0 :(得分:1)


# Split each line of the Stata script on double quotes and keep fields 2 and 4,
# naming them Drug_class and Drug.
drug_class_data <- setNames(
  read.table(
    "Desktop/stata_script",
    header = FALSE,
    sep = '"',
    stringsAsFactors = FALSE
  )[, c(2, 4)],
  c("Drug_class", "Drug")
)

删除尾随* - 用作Stata脚本中的通配符

# Remove the `*` wildcard characters carried over from the Stata patterns.
drug_class_data[["Drug"]] <- gsub("\\*", "", drug_class_data[["Drug"]])

这为您提供了一个包含2列的数据框('Drug_class' 和 'Drug')——上面的代码从Stata脚本的每一行中提取引号内的所有内容(在下面的示例中以粗体突出显示):


replace class = "**ACE Inhibitor**" if strmatch(upper(drug), "**CAPTOPRIL\***")


# Save the class/drug lookup table to disk without a row-name column.
write.csv(
  x = drug_class_data,
  file = "drug_class_data.csv",
  row.names = FALSE
)



2)每种药物占一行,每个药物类别(“ACE抑制剂”、“痤疮药物”等)对应一个布尔列,用 TRUE 或 FALSE 表示该药物是否属于该类别。

我个人赞成以选项2作为下游分析的起点。(正如你所提到的,一种药物可能被归入多个类别,而且有些药物类别呈现层级关系——“麻醉,局部”可能是“麻醉,局部/抗心律失常”、“麻醉,局部/防腐”等的上位术语。)


# Distinct drug classes, taken from the first column of the lookup table.
drug_class_list <- unique(drug_class_data[[1]])


# Build a wide table with one logical membership column per drug class.
#
# Args:
#   df_drugs:   data frame of drug records; its second column holds the drug
#               name. Names are upper-cased before matching.
#   df_classes: lookup data frame whose first column is the class label and
#               whose second column is the drug name to match. The lookup
#               names are compared as-is, so they are expected in upper case.
#
# Returns:
#   `df_drugs` followed by one logical column per distinct class label, named
#   after the class. A value is TRUE when the row's upper-cased drug name is
#   exactly equal to one of the names listed for that class (no wildcard or
#   prefix matching is performed).
create_flat_table <- function(df_drugs, df_classes) {
  # as.character() keeps factor class columns from turning into integer codes
  # when they are used as column names below.
  class_labels <- as.character(df_classes[[1]])
  class_list <- unique(class_labels)
  class_drug_names <- df_classes[[2]]

  # Upper-case the drug names once instead of once per class. Only the
  # function's own arguments are used; no global objects are read.
  drug_names <- toupper(df_drugs[[2]])

  results <- df_drugs
  n_base <- ncol(df_drugs)
  for (i in seq_along(class_list)) {
    drugs_in_class <- class_drug_names[class_labels %in% class_list[i]]
    # Assigning by position appends a column even if a class label happens to
    # equal an existing column name.
    results[[n_base + i]] <- drug_names %in% drugs_in_class
  }

  colnames(results) <- c(colnames(df_drugs), class_list)
  results
}

# Add one logical column per drug class to the drug records.
combined_df <- create_flat_table(
  df_drugs = drug_data,
  df_classes = drug_class_data
)


Resulting Dataframe


答案 1 :(得分:1)

假设 `DF` 和 `statascript` 如文末注释中以可重现的方式给出。然后将类别和模式提取到 `translate` 中,并使用 glob 模式 `pat` 将 `DF` 与它左连接。



# Parse the Stata script text and keep fields 4 and 7 of each line, naming
# them class and pat.
translate <- setNames(
  read.table(text = statascript, as.is = TRUE)[c(4, 7)],
  c("class", "pat")
)

# Left-join every drug record to its class label. The glob pattern is tested
# against the upper-cased drug name, mirroring strmatch(upper(drug), ...) in
# the Stata script. Being a left join, every row of DF is kept even when no
# pattern matches it.
sqldf("select DF.*, translate.class 
       from DF 
       left join translate on upper(DF.drug) glob translate.pat")