努力将传统的webscraping纳入ROR应用程序

时间:2017-03-05 08:46:20

标签: ruby-on-rails ruby

遇到了这个awesome tutorial,它教会了如何通过终端 ROR应用程序(第一次为我)...

所以,试图打破这一点并让它得到模型本身的支持,这样我就可以每天一次或者其他东西显示前3名(除了教育价值之外,这只是玩耍而不是为了任何其他目的而废弃)。

我很困惑如何使用Entry模型支持其传统操作,即索引,创建等...即,所以我每天可以保存3个顶级标题和链接,所以当你去索引页面,您可以看到不同日期的前三个链接

目前正如此:

控制器

class EntryController < ApplicationController

  # Define the Entry object

  def scrape_reddit
    require 'open-uri'
    doc = Nokogiri::HTML(open("https://www.reddit.com/"))

    entries = doc.css('.entry')
    @entriesArray = []
    entries.each do |entry|
      title = entry.css('p.title > a').text
      link = entry.css('p.title > a')[0]['href']
      @entriesArray << Entry.new(title, link)
    end

    # render template: 'scrape_reddit'
  end
end

scrape_reddit.html.erb

<h1>Reddit's Front Page</h1>
<% @entriesArray.each do |entry| %>
  <p><%= entry.title %></p>
  <p><%= entry.link %></p>
<% end %>

路由

Rails.application.routes.draw do
    root 'entry#scrape_reddit'
end

条目模式

class Entry < ApplicationRecord
        attr_reader :title
        attr_reader :link

        def initialize(title, link)
              @title = title
              @link = link
        end
end

模式

class CreateEntries < ActiveRecord::Migration[5.0]
  def change
    create_table :entries do |t|
      t.string :title
      t.string :link

      t.timestamps
    end
  end
end

当我试图用“传统”ROR重拍它时......显然我没有抓住我的基础......

模型

仍与上述相同

路由

Rails.application.routes.draw do
    root 'entry#index'
end

“传统”控制器

class EntryController < ApplicationController

    require 'open-uri'
    doc = Nokogiri::HTML(open("https://www.reddit.com/"))
    @oy = doc.css('.entry')

  def index
    @entries = Entry.all

  end


  def show

  end


  def new
    @entry = Entry.new
  end


  def edit
  end


  def create
    @entry = Entry.new(entry_params)

    respond_to do |format|
      if @entry.save
        format.html { redirect_to @entry, notice: 'entry was successfully created.' }
        format.json { render :show, status: :created, location: @entry }
      else
        format.html { render :new }
        format.json { render json: @entry.errors, status: :unprocessable_entity }
      end
    end
  end

  def update
    respond_to do |format|
      if @entry.update(entry_params)
        format.html { redirect_to @entry, notice: 'entry was successfully updated.' }
        format.json { render :show, status: :ok, location: @entry }
      else
        format.html { render :edit }
        format.json { render json: @entry.errors, status: :unprocessable_entity }
      end
    end
  end


  def destroy
    @entry.destroy
    respond_to do |format|
      format.html { redirect_to tippies_url, notice: 'entry was successfully destroyed.' }
      format.json { head :no_content }
    end
  end

  private
    def set_entry
      @entry = Entry.find(params[:id])
    end

    def entry_params
      params.require(:entry).permit(:title, :link)
    end
end

index.html.erb

<h1>Reddit Entries</h1>

<table>

  <tbody>
    <% @entries.each do |entry| %>
      <tr>
        <td><%= entry.title.css('p.title > a').text %></td>
        <td><%= entry.link.css('p.title > a')[0]['href'] %></td>
      </tr>
    <% end %>
  </tbody>
</table>


再次编辑

CONTROLLER

class EntryController < ApplicationController

  def scrape
      require 'open-uri'
      doc = Nokogiri::HTML(open("https://www.reddit.com/"))

      entries = doc.css('.entry')
      @entriesArray = []
      entries.each do |entry|
      title = entry.css('p.title > a').text
      link = entry.css('p.title > a')[0]['href']
      @entriesArray << Entry.new(title, link)
  end

  # Here I would just save the entries
  # You could perform validation
  if entriesArray.map(&:valid?)
    entriesArray.map(&:save!)
  end
end


  def index
    @entries = Entry.all

  end


  def show

  end


  def new
    @entry = Entry.new
  end


  def edit
  end


  def create
    @entry = Entry.new(entry_params)

    respond_to do |format|
      if @entry.save
        format.html { redirect_to @entry, notice: 'entry was successfully created.' }
        format.json { render :show, status: :created, location: @entry }
      else
        format.html { render :new }
        format.json { render json: @entry.errors, status: :unprocessable_entity }
      end
    end
  end

  def update
    respond_to do |format|
      if @entry.update(entry_params)
        format.html { redirect_to @entry, notice: 'entry was successfully updated.' }
        format.json { render :show, status: :ok, location: @entry }
      else
        format.html { render :edit }
        format.json { render json: @entry.errors, status: :unprocessable_entity }
      end
    end
  end


  def destroy
    @entry.destroy
    respond_to do |format|
      format.html { redirect_to tippies_url, notice: 'entry was successfully destroyed.' }
      format.json { head :no_content }
    end
  end

  private
    def set_entry
      @entry = Entry.find(params[:id])
    end

    def entry_params
      params.require(:entry).permit(:title, :link)
    end
end

路线

Rails.application.routes.draw do
    root 'entry#scrape_reddit'

    get '/new_entries', to: 'entries#scrape', as: 'scrape'
end

ENTRY'S INDEX.HTML.ERB

<h1>Reddit Entries</h1>

<table>

  <tbody>
    <% @entriesArray.each do |entry| %>
      <tr>
        <td><%= entry.title.css('p.title > a').text %></td>
        <td><%= entry.link.css('p.title > a')[0]['href'] %></td>
      </tr>
    <% end %>
  </tbody>
</table>

第三次编辑

enter image description here

第四次编辑

控制器

class EntryController < ApplicationController


def index
  @entries = Entry.all
end

def scrape
    require 'open-uri'
    doc = Nokogiri::HTML(open("https://www.reddit.com/"))

    entries = doc.css('.entry')
    entriesArray = []
    entries.each do |entry|
      title = entry.css('p.title > a').text
      link = entry.css('p.title > a')[0]['href']
      entriesArray << Entry.new({ title: title, link: link })
    end

    # Here I would just save the entries
    # You could perform validation
    if entriesArray.map(&:valid?)
      entriesArray.map(&:save!)
    end

    respond_to do |format|
      format.html { redirect_to entries_url, notice: 'Entries were successfully scraped.' }
      format.json { entriesArray.to_json }
    end
  end

end

index.html.erb

<h1>Reddit's Front Page</h1>
<% @entries.each do |entry| %>
  <p><%= entry.title %></p>
  <p><%= entry.link %></p>
<% end %>

路由

Rails.application.routes.draw do
    #root 'entry#scrape_reddit'
    root 'entry#index'
    resources :entries
end

模型

class Entry < ApplicationRecord
        attr_reader :title
    attr_reader :link

        def initialize(title, link)
              @title = title
              @link = link
        end

end

1 个答案:

答案 0 :(得分:0)

我会使用像whenever这样的宝石每天在每个HH:MM中运行一个cron作业。它需要从你的Entry控制器运行一个方法,比如scrape,它将包含抓取逻辑。

我会移动:

require 'open-uri'
doc = Nokogiri::HTML(open("https://www.reddit.com/"))
@oy = doc.css('.entry')

到新创建的scrape方法并对其进行刮擦,填充模型并保存它们。像这样的东西(实际上你的方法很有效):

编辑2:

我改变了方法并试了一下。我确实工作并保存条目。

def scrape
    require 'open-uri'
    doc = Nokogiri::HTML(open("https://www.reddit.com/"))

    entries = doc.css('.entry')
    entriesArray = []
    entries.each do |entry|
      title = entry.css('p.title > a').text
      link = entry.css('p.title > a')[0]['href']
      entriesArray << Entry.new({ title: title, link: link })
    end

    # Here I would just save the entries
    # You could perform validation
    if entriesArray.map(&:valid?)
      entriesArray.map(&:save!)
    end

    respond_to do |format|
      format.html { redirect_to entries_url, notice: 'Entries were successfully scraped.' }
      format.json { entriesArray.to_json }
    end
  end

然后,当您访问索引页面时,您可以看到新创建的条目。

编辑:

为scrape方法添加一个路径,就像这样

get '/new_entries', to: 'entries#scrape', as: 'scrape'

如果您访问新条目,则应将新条目保存到您的数据库中。