我已经处理了一个项目,我需要从数据集中的多个行中找到重复的配对。虽然数据集要大得多,但主要部分围绕培训日期,培训地点和培训师姓名。因此,每行数据都有一个日期,一个位置,然后是逗号分隔的名称列表:
Date Location Names
1/13/2014 Seattle A, B, D
1/16/2014 Dallas C, D, E
1/20/2014 New York A, D
1/23/2014 Dallas C, E
1/27/2014 Seattle B, D
1/30/2014 Houston C, A, F
2/3/2014 Washington DC D, A, F
2/6/2014 Phoenix B, E
2/10/2014 Seattle C, B
2/13/2014 Miami A, B, E
2/17/2014 Miami C, D
2/20/2014 New York B, E, F
2/24/2014 Houston A, B, F
我的目标是能够找到具有类似名称配对的行。一个例子是知道A& B在1月13日在西雅图,2月13日在迈阿密和2月24日在休斯顿配对,尽管第三个名字在每次出现时都不同。因此,我不想仅仅在整个名称字符串中查找重复项,而是还希望在“名称”列的部分段之间找到配对。
这可以在Excel中执行,还是需要使用编程语言来完成任务?
虽然我可以手动执行此操作,但它代表了大量可用于其他事情的时间。如果有一种方法我可以自动化它,这将使我的任务的这部分更简单。
提前感谢您提供有关前进道路的任何帮助或建议。
答案 0 :(得分:0)
确定。我感到无聊,并在Python代码中完成了这一切。我假设你熟悉这门语言;但是,您应该能够在安装了Python的任何计算机上使用以下代码。
我做了一些假设。例如,我已将您的示例输入用作确定输入。
会破坏程序的一些事情:
关于计划:
围绕使用人名作为键的字典展开。字典中的值是一个带有元组的集合,其中包含他们在哪个日期的位置。然后通过比较这些集合并获得交集,我们可以找到答案。
有点凌乱,因为我把它当作Python练习。没有在Python中编码一段时间,我在没有使用对象的情况下完成了所有操作。只需遵循"说明"并将存储所有信息的inputfile保存在运行该代码段的同一文件夹中。
作为旁注,您可能需要检查程序是否产生正确的输出。
如果您有任何疑问,请随时与我联系。
def readWord(line, stringIndex):
word = ""
while(line[stringIndex] != " "):
word += line[stringIndex]
stringIndex += 1
return word, stringIndex
def removeSpacing(line, stringIndex):
while(line[stringIndex] == " "):
stringIndex += 1
return stringIndex
def readPeople(line, stringIndex):
lineSize = len(line)
people = []
while(stringIndex < lineSize):
people.append(line[stringIndex])
stringIndex += 3
return people
def readLine(travels, line):
stringIndex = 0
date, stringIndex = readWord(line, stringIndex)
stringIndex = removeSpacing(line, stringIndex)
location, stringIndex = readWord(line, stringIndex)
stringIndex = removeSpacing(line, stringIndex)
people = readPeople(line, stringIndex)
for person in people:
if(person not in travels.keys()):
travels[person] = set()
travels[person].add((date, location))
return travels
def main():
f = open(input("Enter filename (must be in same folder as this program code. For instance, name could be: testDocument.txt\n\n"))
travels = dict()
for line in f:
travels = readLine(travels, line)
print("\n\n\n\n PROGRAM RUNNING \n \n")
while(True):
persons = []
userInput = "empty"
while(userInput):
userInput = input("Enter person name (Type Enter to finish typing names): ")
if(userInput):
persons.append(userInput)
output = travels[persons[0]]
for person in persons[1:]:
output = output.intersection(travels[person])
print("")
for hit in output:
print(hit)
print("\nFINISHED WITH ONE RUN. STARTING NEW ONE\n")
答案 1 :(得分:0)
你可以用VBA做到这一点。以下解决方案假定
例程使用Class模块来收集信息,使用两个Collections来处理数据。它还利用了集合不允许使用相同密钥添加两个项目的功能。
重命名类模块: cPairs
Option Explicit
Private pTrainer1 As String
Private pTrainer2 As String
Private pCity As String
Private pDT As Date
Public Property Get Trainer1() As String
Trainer1 = pTrainer1
End Property
Public Property Let Trainer1(Value As String)
pTrainer1 = Value
End Property
Public Property Get Trainer2() As String
Trainer2 = pTrainer2
End Property
Public Property Let Trainer2(Value As String)
pTrainer2 = Value
End Property
Public Property Get City() As String
City = pCity
End Property
Public Property Let City(Value As String)
pCity = Value
End Property
Public Property Get DT() As Date
DT = pDT
End Property
Public Property Let DT(Value As Date)
pDT = Value
End Property
Option Explicit
Option Compare Text
Public cP As cPairs, colP As Collection
Public colCityPairs As Collection
Public vSrc As Variant
Public vRes() As Variant
Public rRes As Range
Public I As Long, J As Long
Public V As Variant
Public sKey As String
Sub FindPairs()
vSrc = Range("A1", Cells(Rows.Count, "C").End(xlUp))
Set colP = New Collection
Set colCityPairs = New Collection
'Collect Pairs
For I = 2 To UBound(vSrc)
V = Split(Replace(vSrc(I, 3), " ", ""), ",")
If UBound(V) >= 1 Then
'sort the pairs
SingleBubbleSort V
Select Case UBound(V)
Case 1
AddPairs V(0), V(1)
Case 2
AddPairs V(0), V(1)
AddPairs V(0), V(2)
AddPairs V(1), V(2)
End Select
End If
Next I
ReDim vRes(0 To colCityPairs.Count, 1 To 3)
vRes(0, 1) = "Date"
vRes(0, 2) = "Location"
vRes(0, 3) = "Pairs"
For I = 1 To colCityPairs.Count
With colCityPairs(I)
vRes(I, 1) = .DT
vRes(I, 2) = .City
vRes(I, 3) = .Trainer1 & ", " & .Trainer2
End With
Next I
Set rRes = Range("E1").Resize(UBound(vRes, 1) + 1, UBound(vRes, 2))
With rRes
.EntireColumn.Clear
.Value = vRes
With .Rows(1)
.HorizontalAlignment = xlCenter
.Font.Bold = True
End With
.Sort key1:=.Columns(3), order1:=xlAscending, key2:=.Columns(1), order2:=xlAscending, _
Header:=xlYes
.EntireColumn.AutoFit
V = VBA.Array(vbYellow, vbGreen)
J = 0
For I = 2 To rRes.Rows.Count
If rRes(I, 3) = rRes(I - 1, 3) Then
.Rows(I).Interior.Color = .Rows(I - 1).Interior.Color
Else
J = J + 1
.Rows(I).Interior.Color = V(J Mod 2)
End If
Next I
End With
End Sub
Sub AddPairs(T1, T2)
Set cP = New cPairs
With cP
.Trainer1 = T1
.Trainer2 = T2
.City = vSrc(I, 2)
.DT = vSrc(I, 1)
sKey = .Trainer1 & "|" & .Trainer2
On Error Resume Next
colP.Add cP, sKey
If Err.Number = 457 Then
Err.Clear
colCityPairs.Add colP(sKey), sKey & "|" & colP(sKey).DT & "|" & colP(sKey).City
colCityPairs.Add cP, sKey & "|" & .DT & "|" & .City
Else
If Err.Number <> 0 Then Stop
End If
On Error GoTo 0
End With
End Sub
Sub SingleBubbleSort(TempArray As Variant)
'copied directly from support.microsoft.com
Dim Temp As Variant
Dim I As Integer
Dim NoExchanges As Integer
' Loop until no more "exchanges" are made.
Do
NoExchanges = True
' Loop through each element in the array.
For I = LBound(TempArray) To UBound(TempArray) - 1
' If the element is greater than the element
' following it, exchange the two elements.
If TempArray(I) > TempArray(I + 1) Then
NoExchanges = False
Temp = TempArray(I)
TempArray(I) = TempArray(I + 1)
TempArray(I + 1) = Temp
End If
Next I
Loop While Not (NoExchanges)
End Sub