如何使用 pandas 在整个 DataFrame 中搜索部分字符串(子串)?

时间:2017-05-29 07:47:25

标签: pandas

如何使用 pandas 在每一列中搜索字符串值?假设我有 32 列,

# Active Record model for a venue. Declares geocoding of the address,
# associations, validations, a Paperclip attachment (cell_image) and helpers
# that assemble the HTML description and derived pricing values.
class Venue < ActiveRecord::Base
  nilify_blanks

  # Display names for club types; club_type_string indexes into this list
  # with the club_type attribute. Index 0 is nil.
  CLUB_TYPES = [nil, 'xxx', 'xxxx', 'xxxx',
                'Xx', 'xx xxxx', 'xxx xxx'].freeze

  # Fixed HTML wrapper placed around the list items built by #description.
  # NOTE(review): DESC_HTML_BEG opens a <ul> that DESC_HTML_END never closes;
  # left unchanged because consumers may depend on the exact markup - confirm.
  DESC_HTML_BEG = "<div style=\"font-family:helvetica;color:white;\" >\r\n<p>\r\nDetails\r\n</p>\r\n<hr>\r\n<div>\r\n<ul>\r\n"
  DESC_HTML_END = "\r\n</div>\r\n</div>"

  # Virtual reader: misc_info joined into a single string with "\n, " as the
  # separator. Returns nil when misc_info is nil.
  def misc_info_raw
    misc_info.join("\n, ") unless misc_info.nil?
  end

  # Virtual writer: stores the given value in misc_info unchanged.
  # NOTE(review): the reader joins an array but the writer does not split its
  # input - presumably callers pass an array; verify against the form/API.
  def misc_info_raw=(values)
    self.misc_info = values
  end

  # Geocoding: copy coordinates (and the neighborhood, when present) from the
  # first lookup result onto the venue. Nothing is assigned without a result.
  geocoded_by :full_address do |venue, results|
    geo = results.first
    if geo
      venue.latitude = geo.latitude
      venue.longitude = geo.longitude
      venue.district = geo.neighborhood if geo.neighborhood
    end
  end

  after_validation :geocode, :if => :address_changed?

  belongs_to :subdivision
  has_many :events
  has_many :stake_holders
  has_one :thumbnail_image
  has_many :inventories, :dependent => :destroy
  has_many :promotions
  has_many :claimed_promotions

  has_many :reviews
  has_many :images, :dependent => :destroy
  has_many :sections, :dependent => :destroy

  accepts_nested_attributes_for :subdivision,
                                :images, :inventories,
                                :thumbnail_image, :sections, :allow_destroy => true

  validates :name,
            :presence => true,
            :length => { :maximum => 30 }

  validates :misc_info, :about,
            :length => { :maximum => 255 }

  validates :club_type,
            :about,
            :percent_discount,
            :bt_cut,
            :district,
            :email, :presence => true

  # cell_image is used instead of thumbnail_image for versions < 2.0
  has_attached_file :cell_image,
                    :url => "/thumbnails/:id/:style/:basename.:extension",
                    :path => "/thumbnails/:id/:style/:basename.:extension"

  validates_attachment_content_type :cell_image, :content_type => /\Aimage\/.*\Z/

  # Required for geocoding: the value handed to geocoded_by.
  def full_address
    address
  end

  # URL of the first associated image, or nil when there are no images.
  def first_image_url
    images.empty? ? nil : images.first.image_url
  end

  # URL of the cell_image attachment (nil if cell_image itself is falsy).
  def thumb_image_url
    cell_image.url if cell_image
  end

  # URL of the thumbnail_image record's image, or nil when there is none.
  def small_thumb_image_url
    thumbnail_image.image.url if thumbnail_image
  end

  # Builds the full description HTML for the venue.
  #
  # The special-event item is prepended only when days_in_future is 0 and
  # tonights_event is set; the BT offer item is prepended whenever bt_offer is
  # set. Since the offer is prepended last, it ends up first in the output.
  #
  # days_in_future - compared against 0 to decide whether tonight's event
  #                  should be shown.
  #
  # Returns the description wrapped in DESC_HTML_BEG / DESC_HTML_END.
  def description(days_in_future)
    base = "#{description_1}#{description4}#{description2}#{description3}"
    # days_in_future is checked first, so tonights_event is only read for
    # same-day requests (same evaluation order as before).
    base = "#{special_event_html}#{base}" if days_in_future == 0 && !tonights_event.nil?
    base = "#{bt_offer_html}#{base}" unless bt_offer.nil?

    "#{DESC_HTML_BEG}#{base}#{DESC_HTML_END}"
  end

  # List item announcing tonight's special event.
  def special_event_html
    return_li_html("SPECIAL EVENT: #{tonights_event}")
  end

  # List item announcing the BT exclusive offer, naming the brand and name of
  # the offer's master inventory. Callers must ensure bt_offer is present.
  def bt_offer_html
    item = bt_offer.inventory.master_inventory
    return_li_html("BT EXCLUSIVE OFFER: Free #{item.brand} #{item.name} with every order!")
  end

  # Returns the string wrapped in a styled <li> element.
  def return_li_html(s)
    "<li style=\"padding-bottom:1ex\">#{s}</li>"
  end

  # Fraction left after subtracting bt_cut (a percentage), e.g. 20 -> 0.8.
  def merchant_cut
    ((100 - bt_cut) * 0.01).to_f
  end

  # percent_discount expressed as a fraction, e.g. 15 -> 0.15.
  def discount
    (percent_discount * 0.01).to_f
  end

  # Returns the club type string for the integer database column club_type.
  def club_type_string
    CLUB_TYPES[club_type]
  end

  # Returns the list of club types.
  def self.club_types
    CLUB_TYPES
  end

  # Builds an identifier of the form "<name>_<locale_code>_<postal_code>",
  # with the name lower-cased and stripped of disallowed characters.
  # NOTE(review): inside the character class, "\\s" is an escaped backslash
  # followed by "s", so whitespace IS removed and literal backslashes are
  # kept. If "\s" (keep whitespace) was intended, changing it would alter the
  # generated ids - confirm before touching.
  def get_bt_merchant_id
    "#{name.gsub(/[^0-9a-z\\s]/i, '').downcase}_#{subdivision.locale_code}_#{subdivision.postal_code}"
  end
end

下面这行代码只能判断该值是否存在于“A”列中。如何搜索每一列,并找出包含该值的行? 数据集:

df[df['A'].str.contains("hello")]

如果我搜索“hello”或“Hello”输出应该是:

A           B           C
1           hi          hie
2           bye         Hello

2 个答案:

答案 0 :(得分:4)

我认为你可以使用:

df = pd.DataFrame({'A':['hello fgf','s','f'],'B':['d','ff hello','f'],'C':[4,7,8]})
print (df)
           A         B  C
0  hello fgf         d  4
1          s  ff hello  7
2          f         f  8

mask = df.applymap(lambda x: 'hello' in str(x))
print (mask)
       A      B      C
0   True  False  False
1  False   True  False
2  False  False  False

然后,如果需要过滤,可以再加上 any,检查每一行中是否至少有一个 True,并通过布尔索引(boolean indexing)筛选出这些行:

df1 = df[mask.any(axis=1)]
print (df1)
           A         B  C
0  hello fgf         d  4
1          s  ff hello  7

编辑:

tested = 'hello'
mask = df.applymap(lambda x:  tested.lower() in str(x).lower())
print (mask)
       A      B      C
0  False  False  False
1  False  False   True

答案 1 :(得分:1)

您还可以将所有列连接成一个字符串,并在连接字符串中搜索您的子字符串:

In [21]: df[df.astype(str).add('|').sum(1).str.contains('hello')]
Out[21]:
           A         B  C
0  hello fgf         d  4
1          s  ff hello  7

说明:

In [22]: df.astype(str).add('|').sum(1)
Out[22]:
0    hello fgf|d|4|
1     s|ff hello|7|
2            f|f|8|
dtype: object