在Python中从文件中提取特定数据

时间:2018-03-28 05:33:04

标签: python database python-2.7 pandas machine-learning

  

我有一个数据文件,内容如下:

# Sample session record used as input: a single dict holding session-level
# fields (e.g. "session_id") and a "visited_pages" list. Each page entry has a
# "url" and, when clicks were recorded, a "visited_page_clicks" list of
# {"x", "y", ...} entries plus a "total_clicks" count; the last page entry has
# neither of those two keys.
# NOTE(review): the "$oid"/"$date" wrappers look like MongoDB extended JSON and
# the "$date" values like epoch milliseconds -- confirm against the data source.
sample = { "_id" : { "$oid" : "57d3a7f6f1a4758c497b23eb" }, "location" : "/contact-us.html", "host" : "www.fullestop.com",
       "h" : "1243", "url" : "https://www.fullestop.com/contact-us.html", "ip" : "146.196.34.138", "beta_user" : 0,
       "referrer" : "https://www.fullestop.com/india/web-design-expert/index.php", "country" : [ "India" ], "browser" :
       "Chrome", "os" : "Unknown OS Platform", "created_date" : { "$date" : 1473445800000 },
       "user_cookie" : "199CB2382B29EC5C32540EBEC7E2413FC1D4B620", "session_id" : "BE2189EDE56CE7C5453627F639AAA1128195E27A",
       "total_visited_pages" : 4, "modified" : { "$date" : 1473488886962 }, "created" : { "$date" : 1473488886963 },
       "visited_pages" : [ { "visited_page_id" : { "$oid" : "57d3a7f6f1a4758c497b23ea" }, "url" : "https://www.fullestop.com/contact-us.html",
       "page_height" : "1243", "visited_on" : { "$date" : 1473488886000 },
       "visited_page_clicks" : [ { "x" : "108", "y" : "204", "page_height" : "1243", "created" : { "$date" : 1473488890000 } },
       { "x" : "396", "y" : "345", "page_height" : "1243", "created" : { "$date" : 1473488892000 } },
       { "x" : "209", "y" : "37", "page_height" : "1243", "created" : { "$date" : 1473488900000 } } ,
       { "x" : "599", "y" : "586", "page_height" : "4000", "created" : { "$date" : 1473488909000 } } ],
       "total_clicks" : 4, "total_time_spent_in_minutes" : "0.23", "total_mouse_moves" : 0 },
      { "visited_page_id" : { "$oid" : "57d3a80df1a475a6377b240d" }, "url" : "https://www.fullestop.com/",
      "page_height" : "4000", "visited_on" : { "$date" : 1473488909000 },
      "visited_page_clicks" : [ { "x" : "611", "y" : "587", "page_height" : "4000", "created" : { "$date" : 1473488911000 } },
     { "x" : "642", "y" : "591", "page_height" : "4000", "created" : { "$date" : 1473488913000 } },
     { "x" : "661", "y" : "592", "page_height" : "4000", "created" : { "$date" : 1473488915000 } },
     { "x" : "680", "y" : "588", "page_height" : "4000", "created" : { "$date" : 1473488916000 } },
     { "x" : "657", "y" : "589", "page_height" : "4000", "created" : { "$date" : 1473488918000 } },
     { "x" : "711", "y" : "588", "page_height" : "4000", "created" : { "$date" : 1473488926000 } },
     { "x" : "732", "y" : "587", "page_height" : "4000", "created" : { "$date" : 1473488929000 } },
     { "x" : "754", "y" : "583", "page_height" : "4000", "created" : { "$date" : 1473488933000 } },
     { "x" : "776", "y" : "587", "page_height" : "4000", "created" : { "$date" : 1473488935000 } },
     { "x" : "941", "y" : "30", "page_height" : "4000", "created" : { "$date" : 1473488938000 } } ],
    "total_clicks" : 10, "total_time_spent_in_minutes" : "0.34", "total_mouse_moves" : 0 },
     { "visited_page_id" : { "$oid" : "57d3a82ff1a475a6377b240e" }, "url" : "https://www.fullestop.com/company-profile.html",
    "page_height" : "5267", "visited_on" : { "$date" : 1473488943000 },
    "visited_page_clicks" : [ { "x" : "1277", "y" : "386", "page_height" : "5267", "created" : { "$date" : 1473488950000 } },
    { "x" : "1365", "y" : "364", "page_height" : "5197", "created" : { "$date" : 1473488962000 } },
    { "x" : "1363", "y" : "1828", "page_height" : "5197", "created" : { "$date" : 1473488971000 } },
    { "x" : "1365", "y" : "3542", "page_height" : "5197", "created" : { "$date" : 1473488977000 } },
    { "x" : "1365", "y" : "4602", "page_height" : "5197", "created" : { "$date" : 1473488979000 } },
    { "x" : "1365", "y" : "2009", "page_height" : "5197", "created" : { "$date" : 1473488985000 } },
    { "x" : "1365", "y" : "1463", "page_height" : "5197", "created" : { "$date" : 1473488985000 } },
    { "x" : "1365", "y" : "843", "page_height" : "5197", "created" : { "$date" : 1473488985000 } },
    { "x" : "1365", "y" : "308", "page_height" : "5197", "created" : { "$date" : 1473488986000 } },
    { "x" : "1365", "y" : "260", "page_height" : "5267", "created" : { "$date" : 1473488986000 } } ],
    "total_clicks" : 10, "total_time_spent_in_minutes" : "8.35", "total_mouse_moves" : 0 },
    { "visited_page_id" : { "$oid" : "57d3aa32f1a4758c497b23ec" }, "url" : "https://www.fullestop.com/contact-us.html",
    "page_height" : "1243", "visited_on" : { "$date" : 1473489458000 }, "total_time_spent_in_minutes" : "0.0",
    "total_mouse_moves" : 0 } ], "total_time_spent_in_minutes" : "9.32" }
  

它的输出应该是: -

这里,从上面数据中取得的会话ID(session_id)对所有行都是相同的,不同的只有提取出的 X、Y 值。每个网址都对应多个 X、Y 值,因此应按如下格式为每个网址逐行列出其 X、Y 值:[图片:enter image description here]

如何解决这个复杂的数据问题?

1 个答案:

答案 0 :(得分:0)

以下是您的问题的解决方案:

def printTable(tbl, borderHorizontal='-', borderVertical='|', borderCross='+'):
    """Print *tbl* as a bordered text table with right-aligned cells.

    Args:
        tbl: Sequence of rows; each row is a sequence of cell values. Cells
            may be of any type and are rendered with ``str()``. If rows have
            different lengths, only as many columns as the shortest row has
            are printed.
        borderHorizontal: Character repeated to draw horizontal rules.
        borderVertical: String placed between cells and at both row ends.
        borderCross: String placed where horizontal and vertical borders meet.

    Returns:
        None. The table is written to standard output, with a horizontal rule
        before the first row and after every row.
    """
    # Stringify every cell once up front. Passing raw values to
    # '{:>N}'.format() raises TypeError for types such as None or list that
    # reject a non-empty format spec, even though their width was measured
    # via str().
    rows = [[str(cell) for cell in row] for row in tbl]
    # zip(*rows) transposes rows into columns (truncating to the shortest row).
    lengths = [max(map(len, col)) for col in zip(*rows)]
    # Each column is padded with one space on either side, hence the +2.
    s = borderCross + borderCross.join(borderHorizontal * (l + 2) for l in lengths) + borderCross

    print(s)
    for row in rows:
        # Pad with rjust rather than a format template so that border strings
        # containing '{' or '}' cannot be misread as format fields.
        cells = [' ' + cell.rjust(l) + ' ' for cell, l in zip(row, lengths)]
        print(borderVertical + borderVertical.join(cells) + borderVertical)
        print(s)


# Sample session record used as input: a single dict holding session-level
# fields (e.g. "session_id") and a "visited_pages" list. Each page entry has a
# "url" and, when clicks were recorded, a "visited_page_clicks" list of
# {"x", "y", ...} entries plus a "total_clicks" count; the last page entry has
# neither of those two keys.
# NOTE(review): the "$oid"/"$date" wrappers look like MongoDB extended JSON and
# the "$date" values like epoch milliseconds -- confirm against the data source.
sample = { "_id" : { "$oid" : "57d3a7f6f1a4758c497b23eb" }, "location" : "/contact-us.html", "host" : "www.fullestop.com",
       "h" : "1243", "url" : "https://www.fullestop.com/contact-us.html", "ip" : "146.196.34.138", "beta_user" : 0,
       "referrer" : "https://www.fullestop.com/india/web-design-expert/index.php", "country" : [ "India" ], "browser" :
       "Chrome", "os" : "Unknown OS Platform", "created_date" : { "$date" : 1473445800000 },
       "user_cookie" : "199CB2382B29EC5C32540EBEC7E2413FC1D4B620", "session_id" : "BE2189EDE56CE7C5453627F639AAA1128195E27A",
       "total_visited_pages" : 4, "modified" : { "$date" : 1473488886962 }, "created" : { "$date" : 1473488886963 },
       "visited_pages" : [ { "visited_page_id" : { "$oid" : "57d3a7f6f1a4758c497b23ea" }, "url" : "https://www.fullestop.com/contact-us.html",
       "page_height" : "1243", "visited_on" : { "$date" : 1473488886000 },
       "visited_page_clicks" : [ { "x" : "108", "y" : "204", "page_height" : "1243", "created" : { "$date" : 1473488890000 } },
       { "x" : "396", "y" : "345", "page_height" : "1243", "created" : { "$date" : 1473488892000 } },
       { "x" : "209", "y" : "37", "page_height" : "1243", "created" : { "$date" : 1473488900000 } } ,
       { "x" : "599", "y" : "586", "page_height" : "4000", "created" : { "$date" : 1473488909000 } } ],
       "total_clicks" : 4, "total_time_spent_in_minutes" : "0.23", "total_mouse_moves" : 0 },
      { "visited_page_id" : { "$oid" : "57d3a80df1a475a6377b240d" }, "url" : "https://www.fullestop.com/",
      "page_height" : "4000", "visited_on" : { "$date" : 1473488909000 },
      "visited_page_clicks" : [ { "x" : "611", "y" : "587", "page_height" : "4000", "created" : { "$date" : 1473488911000 } },
     { "x" : "642", "y" : "591", "page_height" : "4000", "created" : { "$date" : 1473488913000 } },
     { "x" : "661", "y" : "592", "page_height" : "4000", "created" : { "$date" : 1473488915000 } },
     { "x" : "680", "y" : "588", "page_height" : "4000", "created" : { "$date" : 1473488916000 } },
     { "x" : "657", "y" : "589", "page_height" : "4000", "created" : { "$date" : 1473488918000 } },
     { "x" : "711", "y" : "588", "page_height" : "4000", "created" : { "$date" : 1473488926000 } },
     { "x" : "732", "y" : "587", "page_height" : "4000", "created" : { "$date" : 1473488929000 } },
     { "x" : "754", "y" : "583", "page_height" : "4000", "created" : { "$date" : 1473488933000 } },
     { "x" : "776", "y" : "587", "page_height" : "4000", "created" : { "$date" : 1473488935000 } },
     { "x" : "941", "y" : "30", "page_height" : "4000", "created" : { "$date" : 1473488938000 } } ],
    "total_clicks" : 10, "total_time_spent_in_minutes" : "0.34", "total_mouse_moves" : 0 },
     { "visited_page_id" : { "$oid" : "57d3a82ff1a475a6377b240e" }, "url" : "https://www.fullestop.com/company-profile.html",
    "page_height" : "5267", "visited_on" : { "$date" : 1473488943000 },
    "visited_page_clicks" : [ { "x" : "1277", "y" : "386", "page_height" : "5267", "created" : { "$date" : 1473488950000 } },
    { "x" : "1365", "y" : "364", "page_height" : "5197", "created" : { "$date" : 1473488962000 } },
    { "x" : "1363", "y" : "1828", "page_height" : "5197", "created" : { "$date" : 1473488971000 } },
    { "x" : "1365", "y" : "3542", "page_height" : "5197", "created" : { "$date" : 1473488977000 } },
    { "x" : "1365", "y" : "4602", "page_height" : "5197", "created" : { "$date" : 1473488979000 } },
    { "x" : "1365", "y" : "2009", "page_height" : "5197", "created" : { "$date" : 1473488985000 } },
    { "x" : "1365", "y" : "1463", "page_height" : "5197", "created" : { "$date" : 1473488985000 } },
    { "x" : "1365", "y" : "843", "page_height" : "5197", "created" : { "$date" : 1473488985000 } },
    { "x" : "1365", "y" : "308", "page_height" : "5197", "created" : { "$date" : 1473488986000 } },
    { "x" : "1365", "y" : "260", "page_height" : "5267", "created" : { "$date" : 1473488986000 } } ],
    "total_clicks" : 10, "total_time_spent_in_minutes" : "8.35", "total_mouse_moves" : 0 },
    { "visited_page_id" : { "$oid" : "57d3aa32f1a4758c497b23ec" }, "url" : "https://www.fullestop.com/contact-us.html",
    "page_height" : "1243", "visited_on" : { "$date" : 1473489458000 }, "total_time_spent_in_minutes" : "0.0",
    "total_mouse_moves" : 0 } ], "total_time_spent_in_minutes" : "9.32" }

# Build one table row per recorded click: session id, page URL, click x/y and
# the page's total click count. The first row is the header.
x = [['session', 'url', 'x' , 'y', 'Total Clicks']]

for visitedPage in sample['visited_pages']:
    # The session id and URL are identical for every row of the same page.
    rowPrefix = [sample['session_id'], visitedPage['url']]
    # 'total_clicks' is missing on pages without recorded clicks; show '-'.
    if 'total_clicks' in visitedPage:
        totalClicks = str(visitedPage['total_clicks'])
    else:
        totalClicks = '-'

    pageClicks = visitedPage.get('visited_page_clicks')
    if pageClicks:
        for click in pageClicks:
            x.append(rowPrefix + [click['x'], click['y'], totalClicks])
    else:
        # No click list (or an empty one): still emit a single row for the
        # page so it is not dropped from the table, with '-' as x/y.
        x.append(rowPrefix + ['-', '-', totalClicks])

printTable(x)

输出

enter image description here