如何从网页中提取特定文本

时间:2019-02-13 19:05:52

标签: web-scraping

我对寻找星座图很感兴趣。我正在使用“天空地图” Android应用进行视觉检查,现在我想构建一个应用来查找类似的星座结构。其中的一个子问题是找到特定天体的坐标。

示例:如何获取给定时间,日期和位置的“月球”坐标。

https://theskylive.com/planetarium 在其网页上以如下形式提供此信息:

 Object: Moon [info|live][less]
 Right Asc: 04h 15m 12.5s **Decl: 17° 05' 46.3"** (J2000) [HMS|Dec]
 Magnitude: -10.54 Altitude: 56° Solar Elongation: 100.4° Constellation: Ari 
 Sun distance: 147.77 Million Km Earth distance: 0.38 Million Km
 Rise: 10:48 Transit: 18:40 Set: 01:35 **Europe/London**

对于月球,我们可以在该网页上查到坐标;是否有可用的 API?或者,如何通过从网页中提取坐标信息来实现这一点?

4 个答案:

答案 0 :(得分:2)

我不是Android专家,但以下是您可以在 Java 中执行的操作:

build.gradle

// Build script for the Planetarium example: a plain Java project that
// pulls OkHttp (HTTP client) and org.json (JSON parsing) from Maven Central.
plugins {
    id 'java'
}

group 'test.test'
version '1.0-SNAPSHOT'

sourceCompatibility = 1.8

repositories {
    mavenCentral()
}

dependencies {
    // 'compile' and 'testCompile' are deprecated and were removed in Gradle 7;
    // use 'implementation' / 'testImplementation' consistently instead.
    testImplementation group: 'junit', name: 'junit', version: '4.12'
    implementation 'com.squareup.okhttp3:okhttp:3.13.1'
    implementation group: 'org.json', name: 'json', version: '20180813'
}

Planetarium.java

import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import org.json.JSONObject;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Minimal client for the theskylive.com {@code planetariumdata} endpoint.
 *
 * <p>Builds a query for today's date and a list of sky objects, performs a
 * blocking HTTP GET with OkHttp and returns the response parsed as JSON.
 */
public class Planetarium {
    OkHttpClient client = new OkHttpClient();

    /**
     * Fetches today's planetarium data for the given objects.
     *
     * @param objects identifiers of the objects to query, e.g. {@code "moon"}, {@code "mars"}
     * @return the response body parsed into a {@link JSONObject}
     * @throws IOException if the request cannot be executed, the server replies
     *                     with a non-2xx status, or the response carries no body
     */
    JSONObject get(String... objects) throws IOException {
        SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
        StringBuilder urlBuilder = new StringBuilder("https://theskylive.com/planetariumdata?");

        // Current date in YYYY-MM-DD format (formatted in the JVM's default time zone)
        urlBuilder.append("date=").append(dateFormat.format(new Date()));

        // Add one "aobj[]" query parameter per requested object
        for (String obj : objects) {
            // URL encoded aobj[] => aobj%5B%5D=
            // The value is encoded as well so that spaces or reserved
            // characters in an object name cannot corrupt the query string.
            urlBuilder.append("&").append("aobj%5B%5D=")
                    .append(java.net.URLEncoder.encode(obj, "UTF-8"));
        }

        Request request = new Request.Builder()
                .url(urlBuilder.toString())
                .build();

        // try-with-resources closes the response (and its body) in every case
        try (Response response = client.newCall(request).execute()) {
            // Fail early on HTTP errors instead of handing an error page to the JSON parser
            if (!response.isSuccessful()) {
                throw new IOException("Unexpected response: " + response);
            }
            if (response.body() == null) {
                throw new IOException("Empty response body: " + response);
            }
            String json = response.body().string();
            return new JSONObject(json);
        }
    }

    /**
     * Demo entry point: queries the Moon and Mars and pretty-prints the JSON reply.
     *
     * @param args unused
     * @throws IOException if the request fails
     */
    public static void main(String[] args) throws IOException {
        Planetarium planetarium = new Planetarium();
        JSONObject response = planetarium.get("moon", "mars");
        // toString(2) pretty-prints with an indent of two spaces
        System.out.println(response.toString(2));
    }
}

输出:

{
  "utc_seconds": 1551816600,
  "utc_timestamp": "201903052010",
  "objects": {
    "moon": {
      "distsun": 1.479847408587E8,
      "altitude": -32.421642244539,
      "dec": -12.501182812768,
      "constell": "Cap",
      "timezone": "Europe/London",
      "hlat": "-0.0075",
      "hlong": "163.9072",
      "elongation": "9.6",
      "lastdate": "2019-Mar-05 00:00",
      "hx": -0.95427043393163,
      "hy": 0.26061067578779,
      "mag": "-4.82",
      "hlongRad": 2.8607203077248,
      "hz": -1.6343451194632E-4,
      "utc_time": 1551816600,
      "distearth": 405722.20937018,
      "sot": 350.29647638889,
      "id": "moon",
      "circumstances": {
        "transit_local": 11.428494722983,
        "raise_ut": 1.5517668981849E9,
        "set": 16.623858118962,
        "raise_local": 6.3606069281934,
        "visibility": "partial",
        "azimuth_set": 256.90380469917,
        "LSTs": 3.4997935653561,
        "LSTr": 17.208442522882,
        "set_local": 16.623858118962,
        "azimuth_rise": 104.50312047906,
        "GSTs": 3.4997935653561,
        "GSTr": 17.208442522882,
        "transit_ut": 1.551785142581E9,
        "transit": 11.428494722983,
        "raise": 6.3606069281934,
        "set_ut": 1.5518038458892E9,
        "transit_height": 24.710020581601
      },
      "ar": 22.578738425926,
      "name": "Moon",
      "category": "planets",
      "hlatRad": -1.3089969389957E-4,
      "age": 27,
      "status": true
    },
    "mars": {
      "distsun": 2.2963710671492E8,
      "altitude": 27.808183248664,
      "circumstances": {
        "transit_local": 15.80120694427,
        "raise_ut": 1.5517741680418E9,
        "set": 23.222402283833,
        "raise_local": 8.3800116047075,
        "visibility": "partial",
        "azimuth_set": 286.34760861411,
        "LSTs": 10.11640394619,
        "LSTr": 19.233376146402,
        "set_local": 23.222402283833,
        "azimuth_rise": 73.652391385888,
        "GSTs": 10.11640394619,
        "GSTr": 19.233376146402,
        "transit_ut": 1.551800884345E9,
        "transit": 15.80120694427,
        "raise": 8.3800116047075,
        "set_ut": 1.5518276006482E9,
        "transit_height": 54.867608614112
      },
      "dec": 16.347608614112,
      "constell": "Ari",
      "timezone": "Europe/London",
      "hlat": "0.8142",
      "hlong": "75.6345",
      "elongation": "58.1",
      "lastdate": "2019-Mar-05 00:00",
      "hx": 0.36958631955143,
      "ar": 2.6748900462963,
      "hy": 1.4897081109635,
      "mag": "1.23",
      "hlongRad": 1.3200710530997,
      "hz": 0.022145899657793,
      "utc_time": 1551816600,
      "distearth": 2.704192732295E8,
      "name": "Mars",
      "sot": 58.1002,
      "id": "mars",
      "category": "planets",
      "hlatRad": 0.014210470769738,
      "status": true
    },
    "sun": {
      "distsun": 0,
      "altitude": -22.992657046501,
      "circumstances": {
        "transit_local": 12.176106019167,
        "raise_ut": 1.551767861711E9,
        "set": 17.739026911053,
        "raise_local": 6.6282530456618,
        "visibility": "partial",
        "azimuth_set": 263.93596334029,
        "LSTs": 4.618015588543,
        "LSTr": 17.476821431166,
        "set_local": 17.739026911053,
        "azimuth_rise": 96.242086753282,
        "GSTs": 4.618015588543,
        "GSTr": 17.476821431166,
        "transit_ut": 1.5517878339817E9,
        "transit": 12.176106019167,
        "raise": 6.6282530456618,
        "set_ut": 1.5518078604969E9,
        "transit_height": 32.366908597329
      },
      "dec": -6.0242450863769,
      "constell": "Aqr",
      "timezone": "Europe/London",
      "hlat": "n.a.",
      "hlong": "n.a.",
      "elongation": 0,
      "lastdate": "2019-Mar-05 00:00",
      "hx": 0,
      "ar": 23.060617283951,
      "hy": 0,
      "mag": "-26.76",
      "hlongRad": null,
      "hz": 0,
      "utc_time": 1551816600,
      "distearth": 1.4838474994878E8,
      "name": "Sun",
      "sot": 0,
      "id": "sun",
      "category": "planets",
      "hlatRad": null,
      "status": true
    }
  },
  "target": "sun"
}

答案 1 :(得分:1)

我不确定这是否有帮助,但这是它的python实现。您必须弄清楚可接受的位置参数,但是日期,小时和分钟都在这里:

import requests

url = 'https://theskylive.com/planetariumdata'

# Query parameters: the object to look up plus date, hour ('h') and minute ('m').
# NOTE(review): 'localdata' looks like "latitude|longitude|place name|time zone";
# the set of accepted location values still has to be worked out.
params = {
    'obj': 'moon',
    'h': '10',
    'm': '30',
    'date': '2019-02-28',
    'localdata': '51.48|0|Greenwich, United Kingdom|Europe/London',
}

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}

# Without a timeout requests may wait forever on an unresponsive server.
reply = requests.get(url, params=params, headers=headers, timeout=30)
# Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
reply.raise_for_status()
response = reply.json()

print(response['objects']['moon'])

输出:

{'status': True, 'utc_time': 1551349800, 'ar': 18.114288194444, 'dec': -21.301003146701, 'mag': '-9.11', 'distsun': 148031243.76562, 'distearth': 399053.81054688, 'constell': 'Oph', 'sot': 292.2907375, 'lastdate': '2019-Feb-28 00:00', 'hlong': '158.9866', 'hlongRad': 2.7748396365512, 'hlat': '0.0060', 'hlatRad': 0.00010471975511966, 'hx': -0.92639216172362, 'hy': 0.34779595586615, 'hz': 8.4403227939488e-05, 'elongation': '67.7', 'altitude': 7.7566655880485, 'id': 'moon', 'name': 'Moon', 'category': 'planets', 'circumstances': {'visibility': 'partial', 'raise': 3.0419974882059, 'set': 11.875359660362, 'transit': 7.4771821984014, 'raise_ut': 1551322951.191, 'set_ut': 1551354751.2948, 'transit_ut': 1551338917.8559, 'transit_height': 17.341269275926, 'azimuth_rise': 110.98610232928, 'azimuth_set': 248.66063774998, 'LSTr': 13.552197907652, 'LSTs': 22.409745024956, 'GSTr': 13.552197907652, 'GSTs': 22.409745024956, 'raise_local': 3.0419974882059, 'set_local': 11.875359660362, 'transit_local': 7.4771821984014}, 'timezone': 'Europe/London', 'age': 23}

答案 2 :(得分:0)

下面是用 python 实现此功能的代码。将这段代码整合到应用程序中的方法有很多。仅出于演示目的,我把结果转换成了 pandas DataFrame,以便您查看结果。我还添加了一些处理代理设置的代码;如果您不需要代理,可以将其省略,仅通过 requests 包获取 url 文本即可。

希望有帮助。

# `import urllib` alone does not load the `urllib.request` submodule.
import urllib.request

import numpy as np
import pandas as pd

username = 'userID'  # ex. ID
password = "password!"  # password

dataURL = "https://theskylive.com/moon-info"

# Text markers that delimit the ephemeris table inside the page HTML.
TABLE_START_MARKER = 'The Moon Ephemeris'
TABLE_END_MARKER = 'Distance of The Moon from Earth'
# The 10 characters following this marker are read as the table's first date.
# NOTE(review): presumably formatted YYYY-MM-DD -- confirm against the live page.
DATE_MARKER = 'moon&date='
# Opening tags of the data cells that are collected, and the closing tag.
CELL_TAGS = ['<td class="desktop">', '<td>']
CELL_CLOSE = '</td>'

COLUMNS = ['Right Ascension', 'Declination', 'Magnitude',
           'Apparent Diameter', 'Constellation']


def fetch_page(url):
    """Download `url` through the configured proxy and return its decoded text.

    The proxy handler is installed globally via `urllib.request.install_opener`.
    Drop the proxy lines if no proxy is required.
    """
    proxies = {
        'https':  'https://{}:{}@proxy:port'.format(username, password)}
    proxy = urllib.request.ProxyHandler(proxies)
    opener = urllib.request.build_opener(proxy)
    urllib.request.install_opener(opener)

    with urllib.request.urlopen(url) as response:
        # Decode the bytes properly; str(bytes) would yield the "b'...'" repr
        # with escaped newlines and quotes instead of the real page text.
        charset = response.headers.get_content_charset() or 'utf-8'
        return response.read().decode(charset, errors='replace')


def extract_ephemeris(text):
    """Parse the ephemeris table out of the page text.

    Returns a tuple `(start_date, rows)`: `start_date` is the 10-character
    string following the first `DATE_MARKER` inside the table, and `rows` is a
    list of 5-element lists of cell strings in `COLUMNS` order. A trailing
    incomplete row is discarded.

    Raises ValueError when the table markers or the date marker are missing.
    """
    table_start = text.find(TABLE_START_MARKER)
    table_end = text.find(TABLE_END_MARKER)
    if table_start < 0 or table_end < 0:
        raise ValueError('ephemeris table markers not found in page text')

    date_pos = text.find(DATE_MARKER, table_start, table_end)
    if date_pos < 0:
        raise ValueError('start date marker not found in ephemeris table')
    date_from = date_pos + len(DATE_MARKER)
    start_date = text[date_from:date_from + 10]

    rows = []
    row = []
    cursor = table_start
    while True:
        # Locate the next opening tag of either kind after the cursor.
        positions = [text.find(tag, cursor, table_end) for tag in CELL_TAGS]
        # Map "not found" (-1) past the table so argmin selects a real hit.
        ranked = [p if p >= 0 else table_end for p in positions]
        which = int(np.argmin(ranked))
        open_pos = positions[which]
        if open_pos < 0:
            break  # no more cells

        content_start = open_pos + len(CELL_TAGS[which])
        close_pos = text.find(CELL_CLOSE, open_pos, table_end)
        if close_pos < 0:
            break  # unterminated cell: stop parsing

        cell = text[content_start:close_pos]
        column = len(row)
        if column == 1:
            # Declination: spell out the HTML entities for degrees/minutes/seconds.
            cell = cell.replace('&deg;', ' deg')
            cell = cell.replace('&rsquo;', ' min')
            cell = cell.replace('&rdquo;', ' sec')
        elif column == 3:
            # Apparent diameter is given in arcseconds.
            cell = cell.replace('&rdquo;', ' arcsec')
        row.append(cell)
        cursor = close_pos

        if len(row) == len(COLUMNS):
            rows.append(row)
            row = []

    return start_date, rows


def main():
    """Fetch the page, parse the table and print it as a daily-indexed DataFrame."""
    text = fetch_page(dataURL)
    start_date, table = extract_ephemeris(text)

    # Index by the date parsed from the page rather than a hard-coded day.
    idx = pd.date_range(start=start_date, periods=len(table), freq='D')
    Data = pd.DataFrame(table, index=idx, columns=COLUMNS)
    print(Data)


if __name__ == '__main__':
    main()

答案 3 :(得分:0)

只需设置您自己的数据抓取服务器,您就可以使用IFTTT来获取数据并首先存储它。

以下是一些不错的教程:https://public.tableau.com/en-us/s/blog/2013/08/data-scraping-part-i-ifttt

我确定您不希望您的应用进行抓取。 将其存储后,您可以使用任何语言进一步对其进行操作,并以例如json格式作为应用程序的可缓存数据源。