在VB.NET中自动将PDF转换为文本

时间:2015-05-13 12:11:01

标签: vb.net pdf

我目前正在VB.Net控制台应用程序中使用以下代码,该应用程序获取文本文件的内容并提取某些信息,然后将其导出为CSV。

一切似乎都运行良好,但问题是文件最初是以PDF形式提供的(这是唯一的选项),我必须在Adobe中手动打开文件并“另存为文本”。

是否有办法自动将PDF转换为文本文件,或者直接读取PDF而不是文本文件?

任何指导或可选方案都将不胜感激。

' Working state for the report-to-CSV conversion. Every name declared here is
    ' used by the sections further down, so the names themselves must not change.

    ' Line counter and right-to-left scan state.
    Dim iLine As Integer
    Dim iEnd As Integer
    Dim c As Integer
    Dim iField As Integer = 0

    ' Timer value used in the output file names / running size of the lookup table.
    Dim iSecs As Long
    Dim iMax As Long

    ' Current input line, scan accumulator, and the FileID written to new lookup rows.
    Dim sText As String
    Dim sTemp As String = ""
    Dim sSchema As String = "Chester"

    ' Values carried from line to line while parsing; the defaults below are what
    ' ends up in the CSV if the report never supplies the corresponding header.
    Dim sHotel As String = "Unknown Hotel"
    Dim sEndDate As String = "01/01/2015"
    Dim sMon As String = "MAR"
    Dim sPLU As String = ""
    Dim sTots As String = "0"
    Dim sValue As String = "0"
    Dim sDept As String = "Unknown Dept"
    Dim sDesc As String = ""

    ' Input and output paths, assigned when the files are opened.
    Dim sFile As String
    Dim sOutFile As String

    Dim tdate As Date

    ' Access database that holds the Plookup product table.
    Dim con As OleDbConnection = New OleDbConnection("Provider=Microsoft.ACE.OLEDB.12.0; Data Source=C:\temp\TX.accdb;")

    ' True when the current product may be exported (i.e. it is not excluded).
    Dim LUse As Boolean = True


    ' Input: the text export of the report, consumed line by line by the main loop.
    sFile = "c:\temp\input.txt"
    Dim InFile As System.IO.StreamReader = New System.IO.StreamReader(sFile)

    ' Lookup data: load the whole Plookup table (schema first, then rows) into an
    ' in-memory DataTable. The command builder is attached to the adapter so that
    ' rows added later can be written back to the database.
    con.Open()
    Dim dbAdapter As New OleDbDataAdapter("SELECT * FROM Plookup", con)
    Dim cmdbuilder As New OleDbCommandBuilder(dbAdapter)
    Dim dsTX As New DataSet()

    dbAdapter.FillSchema(dsTX, SchemaType.Source, "Plookup")
    dbAdapter.Fill(dsTX, "Plookup")

    ' rstx is the in-memory lookup table; productrow receives lookup results and
    ' changes receives the pending rows when they are saved at the end.
    Dim rstx As DataTable = dsTX.Tables(0)
    Dim productrow() As Data.DataRow
    Dim changes As DataTable
    iMax = rstx.Rows.Count





    'Open Output file
    ' Both output files share one stamp: <yyMMdd><Timer value>.
    ' The month specifier must be upper-case "MM": in VB.NET format strings a
    ' lower-case "mm" means minutes, so the previous "yymmdd" produced
    ' year-minute-day file names. The stamp is built once so the .csv and the
    ' .txt copy can never end up with different names.
    iSecs = Timer
    Dim sStamp As String = Format$(Now, "yyMMdd") & Trim$(Str$(iSecs))
    sOutFile = "c:\temp\TX" & sStamp & ".csv"
    ' Keep a copy of the raw input next to the generated CSV.
    FileCopy(sFile, "c:\temp\TX" & sStamp & ".txt")
    Dim OutFile As New System.IO.StreamWriter(sOutFile)
    'Write header
    OutFile.WriteLine("outlet,dept,epos,tots sold,total price,date of sales")

    ' Main pass over the input: classify each line by its first 7 characters,
    ' remember hotel / end date / department as they appear, and write one CSV
    ' row for every product line that has a non-zero quantity or value and is
    ' not marked as excluded in the lookup table.
    iLine = 0
    Do While InFile.Peek() <> -1
        'Read in text
        iLine = iLine + 1

        sText = InFile.ReadLine
        ' Commas are stripped so that no field can break the CSV output.
        sText = sText.Replace(",", "")
        ' Short department headings ("NN-...") are padded to 9 characters so
        ' they pass the length gate below.
        If Len(sText) > 2 And Len(sText) < 9 Then
            If Mid$(sText, 3, 1) = "-" Then ' Department Name
                sText = sText & Space(9 - Len(sText))
            End If
        End If


        'Process all rows except header row - read data into array
        If Len(sText) > 8 Then
            Select Case Left(sText, 7)

                Case "Consoli"  ' Ignore

                Case "Quanti "  ' Ignore

                Case "Group b"  ' Ignore - but next row is the Hotel Name
                    iLine = iLine + 1
                    sText = InFile.ReadLine
                    ' ReadLine returns Nothing at end of file; stop instead of
                    ' failing on the Replace call below.
                    If sText Is Nothing Then Exit Do
                    sText = sText.Replace(",", "")
                    sHotel = Trim$(Left(sText, 20)) 'The username follows so we may truncate the hotel name

                Case "Date ra" ' End date
                    ' Characters 29-30, 32-33 and 35-38 are joined with "/" and
                    ' parsed; the exported date is one day before that value.
                    ' NOTE(review): CDate parses using the current culture, so
                    ' this presumably relies on a day/month/year locale - confirm.
                    sEndDate = Mid$(sText, 29, 2) & "/" & Mid$(sText, 32, 2) & "/" & Mid$(sText, 35, 4)
                    tdate = CDate(sEndDate).AddDays(-1)

                    sEndDate = tdate.ToString("dd/MM/yyyy")

                Case Else   'Possible Code

                    If Mid$(sText, 3, 1) = "-" Then ' Department Name
                        sDept = Trim(sText)
                    Else
                        If IsNumeric(Left(sText, 7)) Then 'Got a code 
                            ' Val/Str round trip normalises the code to its plain numeric form.
                            sPLU = Trim(Str(Val(Left(sText, 7))))

                            ' Start every product row from a clean state. Without
                            ' this, a row with fewer fields than expected would
                            ' reuse the previous row's quantity/value, and the
                            ' leftover accumulator text would leak into this row.
                            sTemp = ""
                            sValue = "0"
                            sTots = "0"

                            'We don't know where the description ends as it contains spaces
                            'So best way is to start at the end and work back...
                            ' Each space met while scanning backwards counts as one
                            ' field boundary (consecutive spaces count separately).
                            ' The text collected before the 9th boundary is the value
                            ' and before the 11th is the quantity; after the 11th,
                            ' spaces are kept because the rest is the description.
                            iEnd = Len(sText)
                            iField = 0
                            For c = iEnd To 9 Step -1
                                If Not (Mid(sText, c, 1) = " ") Or iField > 10 Then

                                    sTemp = Mid(sText, c, 1) & sTemp

                                Else
                                    iField = iField + 1
                                    If iField = 9 Then
                                        sValue = sTemp

                                    ElseIf iField = 11 Then
                                        sTots = sTemp

                                    End If
                                    sTemp = ""
                                End If

                            Next

                            ' If the scan stopped after exactly 10 boundaries, what is
                            ' left in the accumulator is the quantity and there is no
                            ' description; otherwise the remainder is the description.
                            If iField = 10 Then
                                sTots = Trim(sTemp)
                                sDesc = ""
                            Else
                                sDesc = Trim$(sTemp)
                            End If

                            'lookup code
                            ' Filter on the same FileID value that is stored when a
                            ' new row is added below, so lookup and insert always agree.
                            productrow = rstx.Select("FileID = '" & sSchema & "' and PLU = '" & sPLU & "'")
                            If productrow.Length = 0 Then ' product not found
                                ' Unknown product: add it to the lookup table (not
                                ' excluded) so it is saved to the database at the end.
                                iMax = iMax + 1
                                rstx.Rows.Add(sSchema, sPLU, sDesc, False)

                                LUse = True
                            Else
                                LUse = Not productrow(0)("Exclude")
                            End If


                            If (Val(sTots) + Val(sValue) > 0) And LUse Then  ' We have a non-zero sale or value and it is not excluded
                                OutFile.WriteLine(sHotel & "," & sDept & "," & sPLU & "," & sTots & "," & sValue & "," & sEndDate)
                            End If
                        End If
                    End If


            End Select
        End If
    Loop

    ' Save any lookup rows that were added during the run, then release the
    ' files and the database connection. The Finally block guarantees that the
    ' CSV writer is flushed and closed and the connection is released even if
    ' building the commands or the database update throws.
    Try
        dbAdapter.UpdateCommand = cmdbuilder.GetUpdateCommand(True)
        dbAdapter.InsertCommand = cmdbuilder.GetInsertCommand(True)
        dbAdapter.DeleteCommand = cmdbuilder.GetDeleteCommand()

        ' GetChanges returns Nothing when no rows were added or modified.
        changes = rstx.GetChanges()

        If changes IsNot Nothing Then dbAdapter.Update(changes)
    Finally
        'Close input / output csv files
        InFile.Close()
        OutFile.Close()
        con.Close()
    End Try

1 个答案:

答案 0 :(得分:-1)

尝试iTextSharp。iTextSharp是一个.NET DLL,借助它可以从PDF中提取内容。点击here查看参考资料和示例代码(尽管代码是用C#编写的,但它只是供你了解思路的参考)。