在 R 中从数据框(data frame)的某一列动态提取 XML 属性

时间:2018-10-04 10:57:08

标签: r xml dataframe

我有一个包含 6 列的日志数据框(data frame),其中一列是 XML 格式的字符串,并且每一行都带有多个属性(不同行的属性集合可能不同)。我想从该列中动态提取这些属性,并把每个属性转换成一个新列,排在其余 5 列之后。

这是我df中10行的示例:

# Sample of 10 rows from the log data frame (dput()-style literal).
# Columns:
#   ID           - integer row identifier
#   SourceTable  - name of the table the log entry refers to
#   Action       - action type ("Update" in every sample row)
#   ActionDate   - POSIXct timestamp, tzone = "UTC"
#   ActionUserID - integer user id (contains one NA)
#   RowVersion   - character; a single self-closing XML element whose
#                  attributes hold the row's values. The attribute set differs
#                  from row to row (e.g. "Id" vs "ID", optional "Notes").
# row.names are non-sequential integers.
DF <- structure(list(ID = c(181941L, 181875L, 181280L, 180518L, 179714L, 
181418L, 179225L, 181237L, 181764L, 182052L), SourceTable = c("Warehouse.WHWorkOrderDetails", 
"Purchase.Documents", "Sales.Documents", "Sales.Documents", "Sales.Documents", 
"Sales.Documents", "Sales.DocumentDetails", "Purchase.Documents", 
"Sales.Documents", "Sales.DocumentDetails"), Action = c("Update", 
"Update", "Update", "Update", "Update", "Update", "Update", "Update", 
"Update", "Update"), ActionDate = structure(c(1538411587.44, 
1538410480.417, 1538399700.747, 1538330212.47, 1538312727.6, 
1538402219.487, 1538303485.363, 1538399265.42, 1538406366.29, 
1538412218.867), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
    ActionUserID = c(2100L, 1075L, 1077L, 1069L, 1085L, 1078L, 
    1078L, 2100L, NA, 1078L), RowVersion = c("<Warehouse.WHWorkOrderDetails Id=\"1005849\" WHWorkOrderHeaderId=\"234336\" StockItemId=\"175\" Quantity=\"1.700000000000000e+001\" UnitPrice=\"0.000000000000000e+000\" Value=\"1.000000000000000e+000\" Comments=\"\" LocalOnHandBalance=\"1.866500000000000e+001\" Old_LocalOnHandBalance=\"1.665000000000000e+000\" GlobalOnHandBalance=\"4.944500000000000e+001\" Old_GlobalOnHandBalance=\"3.244500000000000e+001\" ProdDateGlobalOnHandBalance=\"0.000000000000000e+000\" Old_ProdDateGlobalOnHandBalance=\"0.000000000000000e+000\" ProdDateLocalOnHandBalance=\"0.000000000000000e+000\" Old_ProdDateLocalOnHandBalance=\"0.000000000000000e+000\" Price=\"0.000000000000000e+000\" LastCost=\"1.000000000000000e+000\" Old_LastCost=\"2.585000000000000e+001\" AverageCost=\"1.698445692341396e+001\" Old_AverageCost=\"2.535973100872872e+001\" FIFO=\"1.689434725452523e+001\" Old_FIFO=\"2.522240715056249e+001\" LIFO=\"0.000000000000000e+000\" Old_LIFO=\"0.000000000000000e+000\" InsertUserID=\"1071\" InsertDate=\"2018-09-28T11:52:14.577\" ReturnedQuantity=\"0.000000000000000e+000\" RelatedWODetailID=\"0\" FIFO_Detailed=\"[0.0000 * 0.0000][57.5000 * 1.0000][46.0000 * 1.0000][65.0000 * 1.0000][25.0000 * 1.0000][16.5000 * 1.0000][47.0000 * 1.0000][17.0000 * 1.0000]\" DimensionsValue=\"0.000000000000000e+000\" Dimensions=\"0 x 0\" IssuedQuantityFIFO=\"0.000000000000000e+000\" IssuedQuantityLIFO=\"0.000000000000000e+000\" PrevQuantity=\"0.0000\" PrevPrice=\"0.0000\" RelatedPODetailId=\"10032\"/>", 
    "<Purchase.Documents ID=\"468\" DocumentDate=\"2018-10-01T13:28:15\" Serial=\"38\" DocumentType=\"PO\" DocumentCode=\"PO-14-38\" SupplierID=\"1265\" WarehouseID=\"14\" InsertUserID=\"1075\" InsertDate=\"2018-10-01T13:28:15.993\" UpdateUserID=\"1075\" UpdateDate=\"2018-10-01T15:16:03.800\" IsIssued=\"1\" IssueDate=\"2018-10-01T13:28:15.870\" IsCashed=\"0\" IsNPDone=\"0\" PrintPrice=\"0\" NeedInvoice=\"0\" IsByPassed=\"0\" ByPassingReason=\";Has Automatic RS\" IsReviewed=\"0\" Replicated=\"0\" DiscountRate=\"0.0000\" GrossTotal=\"621.0000\" SubTotal=\"621.0000\" Tax=\"0.0000\" NetTotal=\"621.0000\" Discount=\"0.0000\" CashAmount=\"0.0000\" Attachment=\"dbobject/Purchase.Documents[@ID='468']/@Attachment\" AddedTax=\"0.0000\" NeedAddedTax=\"0\" NPAmount=\"0.0000\" OnAccountAmount=\"0.0000\" HasCostAllocation=\"0\" AllocatedCost=\"0.000000000000000e+000\" DistributedCost=\"0.000000000000000e+000\" ExchangeRate=\"1.0000\" TotalItemsDiscounts=\"0.000000000000000e+000\" RequireAutoCash=\"1\" RequireAutoCredit=\"0\" RequireAutoWorkorder=\"1\"/>", 
    "<Sales.Documents ID=\"245181\" DocumentDate=\"2018-09-27T17:52:02.583\" Serial=\"1620\" DocumentType=\"SR\" DocumentCode=\"SR-7-1620\" CustomerID=\"24845\" WarehouseID=\"7\" InsertUserID=\"1078\" InsertDate=\"2018-09-27T17:52:02.590\" UpdateUserID=\"1078\" UpdateDate=\"2018-09-27T17:54:13.367\" IsIssued=\"1\" IssueDate=\"2018-09-27T17:54:13.357\" IsCashed=\"0\" IsNRDone=\"0\" PrintPrice=\"0\" NeedInvoice=\"0\" IsByPassed=\"0\" IsReviewed=\"0\" Replicated=\"0\" DiscountRate=\"0.0000\" GrossTotal=\"10.0400\" SubTotal=\"10.0400\" Tax=\"0.0000\" NetTotal=\"10.0400\" Discount=\"0.0000\" CashAmount=\"10.0400\" HasServicesOnly=\"0\" Attachment=\"\" AddedTax=\"0.0000\" NeedAddedTax=\"0\" CashPaid=\"0.0000\" NRAmount=\"0.0000\" OnAccountAmount=\"0.0000\" ApplyTaxAfterDiscount=\"0\" IsRecurring=\"0\" ExchangeRate=\"1.0000\" OrderStatus=\"Returned\" ShippmentCost=\"0.0000\" TotalItemsDiscounts=\"0.000000000000000e+000\" RequireAutoCash=\"0\" RequireAutoCredit=\"0\" RequireAutoWorkorder=\"0\" IsFromRecurring=\"0\" PaymentStatus=\"Unpaid\" PaidAmount=\"0.0000\"/>", 
    "<Sales.Documents ID=\"248734\" DocumentDate=\"2018-09-30T16:14:37\" Serial=\"44281\" DocumentType=\"SO\" DocumentCode=\"SO-7-44281\" PaperNumber=\"54606\" CustomerID=\"26051\" WarehouseID=\"7\" Notes=\" ????? ??????? ( ?????? ): ????? ????? ???? - ???? ????? ????? ???? - ???? ?????? ????? 3 ???? 47 E        ??????? ??????: ??????? ??????? ?????? ????? .. ??? ???? ???? ? ????? 45 ???? SO-7-44045        ??????? ??????? ??? ???????: ???        ????? ?????: Cash on Delivery        ?? ????? ?????: ??\" InsertUserID=\"1069\" InsertDate=\"2018-09-30T16:14:38.503\" IsIssued=\"0\" IsCashed=\"0\" IsNRDone=\"0\" PrintPrice=\"1\" NeedInvoice=\"0\" IsByPassed=\"0\" IsReviewed=\"0\" Replicated=\"0\" DiscountRate=\"0.0000\" GrossTotal=\"1.0000\" SubTotal=\"1.0000\" Tax=\"0.0000\" NetTotal=\"1.0000\" Discount=\"0.0000\" CashAmount=\"0.0000\" HasServicesOnly=\"0\" ShippingDate=\"2018-09-30T16:14:37\" AddedTax=\"0.0000\" NeedAddedTax=\"0\" CashPaid=\"0.0000\" NRAmount=\"0.0000\" OnAccountAmount=\"1.0000\" ApplyTaxAfterDiscount=\"0\" IsRecurring=\"0\" ExchangeRate=\"1.0000\" OrderStatus=\"Pending\" ShippmentCost=\"0.0000\" TotalItemsDiscounts=\"0.000000000000000e+000\" Channel=\"Call center\" RequireAutoCash=\"0\" RequireAutoCredit=\"0\" RequireAutoWorkorder=\"0\" IsFromRecurring=\"0\" PaymentStatus=\"Unpaid\" PaidAmount=\"0.0000\"/>", 
    "<Sales.Documents ID=\"248673\" DocumentDate=\"2018-09-30T13:03:23.447\" Serial=\"1655\" DocumentType=\"SR\" DocumentCode=\"SR-7-1655\" CustomerID=\"50874\" WarehouseID=\"7\" InsertUserID=\"1085\" InsertDate=\"2018-09-30T13:03:23.450\" UpdateDate=\"2018-09-30T13:03:23.460\" IsIssued=\"0\" IsCashed=\"0\" IsNRDone=\"0\" PrintPrice=\"0\" NeedInvoice=\"0\" IsByPassed=\"0\" IsReviewed=\"0\" Replicated=\"0\" DiscountRate=\"0.0000\" GrossTotal=\"10.0000\" SubTotal=\"10.0000\" Tax=\"0.0000\" NetTotal=\"10.0000\" Discount=\"0.0000\" CashAmount=\"10.0000\" HasServicesOnly=\"0\" AddedTax=\"0.0000\" NeedAddedTax=\"0\" CashPaid=\"0.0000\" NRAmount=\"0.0000\" OnAccountAmount=\"0.0000\" ApplyTaxAfterDiscount=\"0\" IsRecurring=\"0\" ExchangeRate=\"1.0000\" ShippmentCost=\"0.0000\" TotalItemsDiscounts=\"0.000000000000000e+000\" RequireAutoCash=\"0\" RequireAutoCredit=\"0\" RequireAutoWorkorder=\"0\" IsFromRecurring=\"0\" PaymentStatus=\"Unpaid\" PaidAmount=\"0.0000\"/>", 
    "<Sales.Documents ID=\"229764\" DocumentDate=\"2018-09-04T16:07:57\" Serial=\"41223\" DocumentType=\"SO\" DocumentCode=\"SO-7-41223\" PaperNumber=\"50221\" CustomerID=\"24614\" WarehouseID=\"7\" Notes=\" ????? ??????? ( 274 ???????? ??????? ???? ??????? ???? ???? ?? ???? ): ????? 6 ?????? - ??? ????? ???????? ? ??????? ???????? - 274 ???????? ??????? ???? ??????? ???? ???? ?? ????????????? ???2        ??????? ??????? ??? ???????: ???        ????? ?????: Cash on Delivery        ?? ????? ?????: ??\" InsertUserID=\"1069\" InsertDate=\"2018-09-04T16:08:01.130\" UpdateUserID=\"1078\" UpdateDate=\"2018-09-05T10:38:37.343\" IsIssued=\"1\" IssueDate=\"2018-09-05T10:38:35.863\" IsCashed=\"0\" IsNRDone=\"0\" PrintPrice=\"1\" NeedInvoice=\"0\" IsByPassed=\"0\" IsReviewed=\"0\" Replicated=\"0\" DiscountRate=\"0.0000\" GrossTotal=\"117.0000\" SubTotal=\"117.0000\" Tax=\"0.0000\" NetTotal=\"117.0000\" Discount=\"0.0000\" CashAmount=\"0.0000\" HasServicesOnly=\"0\" Attachment=\"\" ShippingDate=\"2018-09-05T16:07:57\" AddedTax=\"0.0000\" NeedAddedTax=\"0\" CashPaid=\"0.0000\" NRAmount=\"0.0000\" OnAccountAmount=\"117.0000\" ApplyTaxAfterDiscount=\"0\" IsRecurring=\"0\" ExchangeRate=\"1.0000\" OrderStatus=\"Shipped\" ShippmentCost=\"0.0000\" TotalItemsDiscounts=\"0.000000000000000e+000\" RequireAutoCash=\"0\" RequireAutoCredit=\"0\" RequireAutoWorkorder=\"0\" IsFromRecurring=\"0\" DueDate=\"2018-09-04T16:07:57\" AnnualPriceRaise=\"0.0000\" PaymentFrequency=\"Monthly\" RaiseAmount=\"0.0000\" PaymentStatus=\"Paid\" PaidAmount=\"117.0000\"/>", 
    "<Sales.DocumentDetails ID=\"876309\" DocumentID=\"247540\" Quantity=\"1.000000000000000e+000\" UnitPrice=\"0.000000000000000e+000\" Price=\"0.000000000000000e+000\" TaxRate=\"0.000000000000000e+000\" UnitDiscount=\"0.000000000000000e+000\" ServiceItemID=\"1\" InsertUserID=\"1069\" InsertDate=\"2018-09-29T17:35:29.167\" UpdateUserID=\"1078\" UpdateDate=\"2018-09-30T08:28:10.480\" RelatedWODetailId=\"0\" DimensionsValue=\"0.000000000000000e+000\" ItemDiscount=\"0.000000000000000e+000\" ItemDiscountType=\"0\" SetupItemDiscount=\"0.000000000000000e+000\" SetupItemDiscountType=\"0\" ItemIsIssued=\"1\" IssuedQuantity=\"1.000000000000000e+000\"/>", 
    "<Purchase.Documents ID=\"461\" DocumentDate=\"2018-09-29T08:41:34\" Serial=\"338\" DocumentType=\"PO\" DocumentCode=\"PO-7-338\" SupplierID=\"1265\" WarehouseID=\"7\" InsertUserID=\"1085\" InsertDate=\"2018-09-29T08:41:34.673\" UpdateUserID=\"2100\" UpdateDate=\"2018-10-01T13:07:45.277\" IsIssued=\"1\" IssueDate=\"2018-09-29T08:41:34.583\" IsCashed=\"0\" IsNPDone=\"0\" PrintPrice=\"0\" NeedInvoice=\"0\" IsByPassed=\"0\" TreasuryID=\"5169\" IsReviewed=\"0\" Replicated=\"0\" DiscountRate=\"0.0000\" GrossTotal=\"1330.8730\" SubTotal=\"1330.8730\" Tax=\"0.0000\" NetTotal=\"1330.8730\" Discount=\"0.0000\" CashAmount=\"0.0000\" Attachment=\"dbobject/Purchase.Documents[@ID='461']/@Attachment\" AddedTax=\"0.0000\" NeedAddedTax=\"0\" NPAmount=\"0.0000\" OnAccountAmount=\"0.0000\" HasCostAllocation=\"0\" AllocatedCost=\"0.000000000000000e+000\" DistributedCost=\"0.000000000000000e+000\" ExchangeRate=\"1.0000\" TotalItemsDiscounts=\"0.000000000000000e+000\" RequireAutoCash=\"1\" RequireAutoCredit=\"0\" RequireAutoWorkorder=\"1\"/>", 
    "<Sales.Documents ID=\"248908\" DocumentDate=\"2018-10-01T15:06:05.600\" Serial=\"18896\" DocumentType=\"SO\" DocumentCode=\"SO-14-18896\" CustomerID=\"42042\" WarehouseID=\"14\" InsertUserID=\"1079\" InsertDate=\"2018-10-01T15:06:05.617\" UpdateUserID=\"1079\" UpdateDate=\"2018-10-01T15:06:05.637\" IsIssued=\"1\" IssueDate=\"2018-10-01T15:06:05.600\" IsCashed=\"1\" IsNRDone=\"0\" PrintPrice=\"0\" NeedInvoice=\"0\" IsByPassed=\"0\" ByPassingReason=\";Has Automatic CI\" TreasuryID=\"5121\" IsReviewed=\"0\" Replicated=\"0\" DiscountRate=\"0.0000\" GrossTotal=\"10.9500\" SubTotal=\"10.9500\" Tax=\"0.0000\" NetTotal=\"10.9500\" Discount=\"0.0000\" CashAmount=\"10.9500\" HasServicesOnly=\"0\" Attachment=\"\" AddedTax=\"0.0000\" NeedAddedTax=\"0\" CashPaid=\"0.0000\" NRAmount=\"0.0000\" OnAccountAmount=\"0.0000\" ApplyTaxAfterDiscount=\"0\" IsRecurring=\"0\" ExchangeRate=\"1.0000\" OrderStatus=\"Shipped\" ShippmentCost=\"0.0000\" TotalItemsDiscounts=\"0.000000000000000e+000\" RequireAutoCash=\"1\" RequireAutoCredit=\"1\" RequireAutoWorkorder=\"1\" IsFromRecurring=\"0\" DueDate=\"2018-10-01T15:05:40\" AnnualPriceRaise=\"0.0000\" PaymentFrequency=\"Monthly\" RaiseAmount=\"0.0000\" PaymentStatus=\"Paid\" PaidAmount=\"10.9500\"/>", 
    "<Sales.DocumentDetails ID=\"879999\" DocumentID=\"248829\" Quantity=\"5.000000000000000e-001\" UnitPrice=\"0.000000000000000e+000\" Price=\"3.500000000000000e+001\" TaxRate=\"0.000000000000000e+000\" UnitDiscount=\"0.000000000000000e+000\" StockItemID=\"175\" Comments=\"????: 0.5 ????\" InsertUserID=\"1069\" InsertDate=\"2018-10-01T10:10:44.603\" RelatedWODetailId=\"0\" DimensionsValue=\"0.000000000000000e+000\" ItemDiscount=\"0.000000000000000e+000\" ItemDiscountType=\"1\" SetupItemDiscount=\"0.000000000000000e+000\" SetupItemDiscountType=\"0\" ItemIsIssued=\"0\" IssuedQuantity=\"0.000000000000000e+000\"/>"
    )), row.names = c(2787L, 2721L, 2132L, 1378L, 691L, 2268L, 
202L, 2089L, 2614L, 2898L), class = "data.frame") 

1 个答案:

答案 0 :(得分:1)

请尝试下面的代码(其中 read_xml() 和 xml_attrs() 来自 xml2 包,运行前需要先执行 library(xml2))。首先,我把每一行的所有属性提取为一个列表,然后把该列表转换为 data.frame 格式。如果某一行缺少某个属性,对应的单元格保留为 NA。

## Turn the attributes of the XML element stored in DF$RowVersion into
## columns appended after the existing columns of DF.
##
## Packages: xml2 provides read_xml() / xml_attrs(); readr provides
## parse_guess(); dplyr provides bind_cols(). All are loaded up front so the
## script fails fast if one is missing.
library(xml2)
library(dplyr)
library(readr)

## extract all attributes: one named character vector per row of DF
result <- lapply(DF$RowVersion, function(x) {
  xml_attrs(read_xml(x))
})

## get all names of attributes (union over all rows, in first-seen order)
allNames <- unique(unlist(lapply(result, names)))

# create placeholder with all NAs; typed as character so it matches the
# attribute values instead of relying on implicit logical -> character coercion
placeholder <- rep(NA_character_, length(allNames))
names(placeholder) <- allNames

# One row per input element, one column per attribute name. rbind() is used
# instead of t(sapply(...)) because sapply() collapses to a plain vector when
# there is only a single attribute name, which would flip the orientation.
matrixRes <- do.call(rbind, lapply(result, function(x) {
  # fill values with existing attributes; missing ones stay NA
  tmp <- placeholder
  tmp[names(x)] <- x
  tmp
}))

# parse data to the proper formats:
# stringsAsFactors = FALSE keeps the columns as character on R < 4.0, which
# parse_guess() requires.
# NOTE(review): parse_guess() treats "" as NA by default, so an empty
# attribute (e.g. Comments="") becomes indistinguishable from a missing one.
dataFrameRes <- as.data.frame(matrixRes, stringsAsFactors = FALSE)
dataFrameRes[] <- lapply(dataFrameRes, parse_guess)

# NOTE(review): DF already has a column named "ID" and some rows carry an
# "ID" attribute; bind_cols() will presumably rename the clashing columns via
# its name repair -- confirm the resulting names before relying on them.
bind_cols(DF, dataFrameRes)