我目前正在VB.Net控制台应用程序中使用以下代码,该应用程序获取文本文件的内容并提取某些信息,然后将其导出为CSV。
所有似乎都运行良好,但问题是文件最初是以PDF形式出现的(只有选项),我必须在Adobe手动打开文件并“另存为文本”。
是否有办法自动将PDF转换为文本文件或读取PDF代替文本文件。
任何指导或选择都将不胜感激
Dim iLine, iEnd, c, iField As Integer
Dim iSecs, iMax As Long
Dim sText, sTemp, sSchema As String
Dim sHotel, sEndDate, sMon, sPLU, sTots, sValue, sDept, sFile, sOutFile, sDesc As String
Dim tdate As Date
Dim con As New OleDbConnection("Provider=Microsoft.ACE.OLEDB.12.0; Data Source=C:\temp\TX.accdb;")
Dim LUse As Boolean
sHotel = "Unknown Hotel"
sEndDate = "01/01/2015"
sMon = "MAR"
sPLU = ""
sTots = "0"
sValue = "0"
sDept = "Unknown Dept"
sDesc = ""
LUse = True
sTemp = ""
iField = 0
sSchema = "Chester"
'Open input file
sFile = "c:\temp\input.txt"
Dim InFile As New System.IO.StreamReader(sFile)
'Open lookup data table
con.Open()
Dim dbAdapter As OleDbDataAdapter = New OleDbDataAdapter( _
"SELECT * FROM Plookup", con)
Dim dsTX As DataSet = New DataSet()
Dim changes As DataTable
Dim cmdbuilder As OleDbCommandBuilder = New OleDbCommandBuilder(dbAdapter)
dbAdapter.FillSchema(dsTX, SchemaType.Source, "Plookup")
dbAdapter.Fill(dsTX, "Plookup")
Dim rstx As DataTable = dsTX.Tables(0)
iMax = rstx.Rows.Count
Dim productrow() As Data.DataRow
'Open Output file
iSecs = Timer
sOutFile = "c:\temp\TX" & Format$(Now, "yymmdd") & Trim$(Str$(iSecs)) & ".csv"
FileCopy(sFile, "c:\temp\TX" & Format$(Now, "yymmdd") & Trim$(Str$(iSecs)) & ".txt")
Dim OutFile As New System.IO.StreamWriter(sOutFile)
'Write header
OutFile.WriteLine("outlet,dept,epos,tots sold,total price,date of sales")
iLine = 0
Do While InFile.Peek() <> -1
'Read in text
iLine = iLine + 1
sText = InFile.ReadLine
sText = sText.Replace(",", "")
If Len(sText) > 2 And Len(sText) < 9 Then
If Mid$(sText, 3, 1) = "-" Then ' Department Name
sText = sText & Space(9 - Len(sText))
End If
End If
'Process all rows except header row - read data into array
If Len(sText) > 8 Then
Select Case Left(sText, 7)
Case "Consoli" ' Ignore
Case "Quanti " ' Ignore
Case "Group b" ' Ignore - but next row is the Hotel Name
iLine = iLine + 1
sText = InFile.ReadLine
sText = sText.Replace(",", "")
sHotel = Trim$(Left(sText, 20)) 'The username follows so we may truncate the hotel name
Case "Date ra" ' End date
sEndDate = Mid$(sText, 29, 2) & "/" & Mid$(sText, 32, 2) & "/" & Mid$(sText, 35, 4)
tdate = CDate(sEndDate).AddDays(-1)
sEndDate = tdate.ToString("dd/MM/yyyy")
Case Else 'Possible Code
If Mid$(sText, 3, 1) = "-" Then ' Department Name
sDept = Trim(sText)
Else
If IsNumeric(Left(sText, 7)) Then 'Got a code
sPLU = Trim(Str(Val(Left(sText, 7))))
'We don't know where the description ends as it contains spaces
'So best way is to start at the end and work back...
iEnd = Len(sText)
iField = 0
For c = iEnd To 9 Step -1
If Not (Mid(sText, c, 1) = " ") Or iField > 10 Then
sTemp = Mid(sText, c, 1) & sTemp
Else
iField = iField + 1
If iField = 9 Then
sValue = sTemp
ElseIf iField = 11 Then
sTots = sTemp
End If
sTemp = ""
End If
Next
If iField = 10 Then
sTots = Trim(sTemp)
sDesc = ""
Else
sDesc = Trim$(sTemp)
End If
'lookup code
productrow = rstx.Select("FileID = 'Chester' and PLU = '" & sPLU & "'")
If productrow.Length = 0 Then ' product not found
iMax = iMax + 1
rstx.Rows.Add(sSchema, sPLU, sDesc, False)
LUse = True
Else
LUse = Not productrow(0)("Exclude")
End If
If (Val(sTots) + Val(sValue) > 0) And LUse Then ' We have a non-zero sale or value and it is not excluded
OutFile.WriteLine(sHotel & "," & sDept & "," & sPLU & "," & sTots & "," & sValue & "," & sEndDate)
End If
End If
End If
End Select
End If
Loop
'dbAdapter.Update(dsTX.Tables(0))
'Close input / output csv files
'rstx.Rows.Add("303030", "Another Test", False)
dbAdapter.UpdateCommand = cmdbuilder.GetUpdateCommand(True)
dbAdapter.InsertCommand = cmdbuilder.GetInsertCommand(True)
dbAdapter.DeleteCommand = cmdbuilder.GetDeleteCommand()
changes = rstx.GetChanges()
If changes IsNot Nothing Then dbAdapter.Update(changes)
InFile.Close()
OutFile.Close()
con.Close()
答案 0 :(得分:-1)
尝试itextSharp。 itextSharp是一个.NET DLL,借助它可以从PDF中提取内容。点击here作为参考&amp;示例代码(尽管代码在c#中,它只是一个给你一个想法的参考)。