使用 pandas read_html 选取 HTML 表格中的特定值

时间:2020-07-06 10:43:30

标签: python pandas

如何使用 Python pandas 选取 HTML 表格中的特定行?例如,解析 HTML 页面后,用 pandas 代码从下面的表中选出 Tag 为 2.4.33 的那一行。

root@1ec99b8b97af:/opt# python lookuptag.py 
             Id                              Tag        Created       Layers      Size  Delete
0   bb84b573f76                           2.4.33       2 years ago     22  179.6 MB  Delete
1   bb84b573f76                           2.4.33-t2    2 years ago     22  179.6 MB  Delete
2   5c97c0e3531                        v8-2.4.33       1 year ago      22  180.7 MB  Delete

这是我的 Python pandas 代码,我可以用它打印出 HTML 页面中的表格

import requests
import pandas as pd

# Page whose HTML tables are parsed below.
url = 'http://docker-registry:8080/repo/tags/httpd'

# Bound the wait and fail loudly on an HTTP error status, so that an error
# page is never handed to the HTML parser as if it were the real listing.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content

# header=0 uses the first row of each table as its column names.
df_list = pd.read_html(html, header =0, flavor = 'bs4')

# Keep the last table found on the page and print it.
df = df_list[-1]
print(df)

1 个答案:

答案 0 :(得分:0)

        def FetchTable(context,tablexpath):
                """Return the first HTML table of the hard-coded page with every cell as text.

                Both ``context`` and ``tablexpath`` are accepted but never read by
                this function; the page address is fixed in the body.
                """
                page_address = 'https://www.espncricinfo.com/table/series/8048/season/2020/indian-premier-league'
                first_table = pd.read_html(page_address)[0]
                # Convert each cell with str() so that later string operations
                # work on every column.
                return first_table.applymap(str)
        
        def LookupValueInColumnTwoKeys(context, source_table, reference_column_1, reference_value_1, reference_column_2, reference_value_2, lookup_column):
                """Return the ``lookup_column`` cell of the first row matching two keys.

                A row matches when ``reference_value_1`` occurs inside the row's
                ``reference_column_1`` cell and ``reference_value_2`` occurs inside
                its ``reference_column_2`` cell. All comparisons (column names and
                cell contents) ignore space characters, so "Delhi Capitals" matches
                a cell or header written with or without the space.

                Args:
                    context: Not used by this function.
                    source_table: DataFrame to search.
                    reference_column_1: Header of the first key column.
                    reference_value_1: Text that must occur in the first key cell.
                    reference_column_2: Header of the second key column.
                    reference_value_2: Text that must occur in the second key cell.
                    lookup_column: Header of the column whose value is returned.

                Returns:
                    The cell of ``lookup_column`` in the first matching row.

                Raises:
                    KeyError: If one of the three column names is not a header of
                        ``source_table``.
                    IndexError: If no row matches both reference values.
                """
                def without_spaces(value):
                    # Spaces used to be removed from the search values only, so a
                    # value containing a space could never be found in a cell that
                    # still had it. Normalise both sides the same way instead.
                    return str(value).replace(' ', '')

                def column_position(name):
                    # Position of the first header equal to ``name`` (ignoring spaces).
                    wanted_header = without_spaces(name)
                    for position, header in enumerate(source_table.columns):
                        if without_spaces(header) == wanted_header:
                            return position
                    # Fail loudly rather than silently falling back to the last column.
                    raise KeyError("column %r not found in table" % (name,))

                first_key_position = column_position(reference_column_1)
                second_key_position = column_position(reference_column_2)
                result_position = column_position(lookup_column)

                wanted_1 = without_spaces(reference_value_1)
                wanted_2 = without_spaces(reference_value_2)

                for tablerow in source_table.values:
                    # Substring match, as before: the key only has to occur in the cell.
                    if (wanted_1 in without_spaces(tablerow[first_key_position])
                            and wanted_2 in without_spaces(tablerow[second_key_position])):
                        return tablerow[result_position]

                raise IndexError(
                    "no row with %r in column %r and %r in column %r"
                    % (reference_value_1, reference_column_1, reference_value_2, reference_column_2)
                )
            
    Another file:
    
    from behave import *
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.support import expected_conditions as ec
    from readTableDataFromDB import readTableDataFromDB
    from pandacode import WebTableValidationHelper as pandacode
    from selenium.webdriver.chrome.options import Options
    # NOTE(review): `context` is not defined in this snippet; presumably these
    # statements belong inside a behave step function - confirm against the full file.
    context.driver.get("https://www.espncricinfo.com/table/series/8048/season/2020/indian-premier-league")
    # Read the table from the page, compare it with itself, then look up the
    # "PT" value of the row whose "TEAM" is "Delhi Capitals".
    matrix = pandacode.FetchTable(context,"//*[@class='table table-sm standings-widget-table text-center mb-0 border-bottom']")
    ismatrixequal = pandacode.VerifyTable(context,matrix,matrix)
    #print(ismatrixequal)
    lookupvalue = pandacode.LookupValueFromColumnSingleKey(context,matrix,"TEAM", "Delhi Capitals", "PT")
    print(lookupvalue)
        



another code:

def LookupValueFromColumnSingleKey1(context, source_table,reference_column_1, rowName, columnName):
    """Return the ``columnName`` cell of the first row whose key cell equals ``rowName``.

    The key cell is the row's value in ``reference_column_1``; it is compared
    with ``rowName`` case-insensitively and must match in full.

    Args:
        context: Not used by this function.
        source_table: DataFrame to search.
        reference_column_1: Header of the column that identifies the row.
        rowName: Text identifying the wanted row.
        columnName: Header of the column whose value is returned.

    Returns:
        The cell of ``columnName`` in the first matching row.

    Raises:
        KeyError: If ``reference_column_1`` or ``columnName`` is not a header
            of ``source_table``.
        IndexError: If no row has ``rowName`` in ``reference_column_1``.
    """
    headers = list(source_table.columns)

    def column_position(name):
        # Position of the first header equal to ``name``; fail loudly instead
        # of silently falling back to the last column.
        try:
            return headers.index(name)
        except ValueError:
            raise KeyError("column %r not found in table" % (name,)) from None

    key_position = column_position(reference_column_1)
    value_position = column_position(columnName)
    wanted = rowName.lower()

    for tablerow in source_table.values:
        # Only the reference column identifies the row; the same text
        # appearing in another column must not select that row.
        if str(tablerow[key_position]).lower() == wanted:
            return tablerow[value_position]

    # Previously a missing row silently produced the last row's value.
    raise IndexError(
        "no row with %r in column %r" % (rowName, reference_column_1)
    )