我有一个JSON文件,其中填充了来自抓取网站的数据,里面的重复记录比正常情况多。下面先给出JSON文件的一段摘录作为示例。是否可以删除重复项,同时保留每条记录的第一次出现?文末也附上了我的完整代码,以备需要。
# Collect the href of every trending-quote link shown on the current page.
def getTrendingQuotes(browser):
    # Block (up to 10 s) until the anchors inside #trendingQuotes exist.
    anchors = WebDriverWait(browser, 10).until(
        lambda d: d.find_elements_by_css_selector('#trendingQuotes a')
    )
    hrefs = []
    for anchor in anchors:
        hrefs.append(anchor.get_attribute('href'))
    return hrefs
def getStockDetails(url, browser):
    """Open one quote page, print its details, and record them via convertToJson."""
    print(url)
    browser.get(url)
    wrapper = browser.find_element_by_css_selector('div.quote-wrapper')
    name_block = wrapper.find_element_by_class_name("quote-name")
    name = name_block.find_element_by_tag_name('h2').text
    price = wrapper.find_element_by_class_name("quote-price").text
    volume = wrapper.find_element_by_class_name("quote-volume").text
    print("\n")
    print("Quote Name: " + name)
    print("Quote Price: " + price)
    print("Quote Volume: " + volume)
    print("\n")
    convertToJson(name, price, volume, url)
# Accumulates one dict per scraped quote; flushed to disk by trendingBot.
quotesArr = []


def convertToJson(quote_name, quote_price, quote_volume, url):
    """Append a single quote record (url/Name/Price/Volume) to quotesArr."""
    record = {
        "url": url,
        "Name": quote_name,
        "Price": quote_price,
        "Volume": quote_volume,
    }
    quotesArr.append(record)
def trendingBot(url, browser):
    """Visit the index page, scrape every trending quote, then persist the data."""
    browser.get(url)
    for link in getTrendingQuotes(browser):
        getStockDetails(link, browser)
    # Every page visited -- write the accumulated records out as JSON.
    with open('trendingQuoteData.json', 'w') as outfile:
        json.dump(quotesArr, outfile)
def Main():
    """Start a headless Chrome session and run trendingBot hourly."""
    opts = Options()
    opts.add_argument("--headless")
    # NOTE(review): --disable-gpu is only relevant on Windows.
    opts.add_argument('--disable-gpu')
    driver = webdriver.Chrome(chrome_options=opts)
    url = 'https://www.tmxmoney.com/en/index.html'
    driver.get(url)
    os.system('cls')
    print("[+] Success! Bot Starting!")
    scheduler = BlockingScheduler()
    # First run fires immediately, then once per hour.
    scheduler.add_job(trendingBot, 'interval', hours=1,
                      next_run_time=datetime.now(), args=[url, driver])
    scheduler.start()
    driver.quit()
# Script entry point.
if __name__ == "__main__":
    Main()
[
{
"url": "https://web.tmxmoney.com/quote.php?qm_symbol=ACB&locale=EN",
"Volume": "Volume:\n12,915,903",
"Price": "$ 7.67",
"Name": "Aurora Cannabis Inc."
},
{
"url": "https://web.tmxmoney.com/quote.php?qm_symbol=HNL&locale=EN",
"Volume": "Volume:\n548,038",
"Price": "$ 1.60",
"Name": "Horizon North Logistics Inc."
},
{
"url": "https://web.tmxmoney.com/quote.php?qm_symbol=ACB&locale=EN",
"Volume": "Volume:\n12,915,903",
"Price": "$ 7.67",
"Name": "Aurora Cannabis Inc."
}
]
答案 0(得分:0):
如果您将字典中的所有实体都视为重复的实体,则可以采用以下解决方案:
# Assumes `file` already holds the list of quote dicts parsed from the JSON.
# (NOTE: `file` shadows a builtin name; kept here to match the question.)
names = []      # distinct quote names, in first-seen order
idx = []        # index of the first occurrence of each name
new_file = []   # deduplicated records
# enumerate() replaces the original `for i in range(len(file))` anti-idiom.
for i, record in enumerate(file):
    # Keep a record only the first time its "Name" value appears.
    if record['Name'] not in names:
        names.append(record['Name'])
        idx.append(i)
        new_file.append(record)
print(names)
print(idx)
new_file
['Aurora Cannabis Inc.', 'Horizon North Logistics Inc.']
[0, 1]
[{'Name': 'Aurora Cannabis Inc.',
'Price': '$ 7.67',
'Volume': 'Volume:\n12,915,903',
'url': 'https://web.tmxmoney.com/quote.php?qm_symbol=ACB&locale=EN'},
{'Name': 'Horizon North Logistics Inc.',
'Price': '$ 1.60',
'Volume': 'Volume:\n548,038',
'url': 'https://web.tmxmoney.com/quote.php?qm_symbol=HNL&locale=EN'}]
答案 1(得分:-1):
如果您想以Python的方式执行此操作...
# Sample input: three records, two of which share the same url (duplicates).
arr = [
    {
        "url": "https://web.tmxmoney.com/quote.php?qm_symbol=ACB&locale=EN",
        "Volume": "Volume:\n12,915,903",
        "Price": "$ 7.67",
        "Name": "Aurora Cannabis Inc."
    },
    {
        "url": "https://web.tmxmoney.com/quote.php?qm_symbol=HNL&locale=EN",
        "Volume": "Volume:\n548,038",
        "Price": "$ 1.60",
        "Name": "Horizon North Logistics Inc."
    },
    {
        "url": "https://web.tmxmoney.com/quote.php?qm_symbol=ACB&locale=EN",
        "Volume": "Volume:\n12,915,903",
        "Price": "$ 7.67",
        "Name": "Aurora Cannabis Inc."
    }
]


def drop_duplicates(arr):
    """Return a new list keeping only the first occurrence of each 'url'.

    Original order of the surviving items is preserved.  Seen urls are
    tracked in a set so each membership test is O(1) instead of the
    original O(n) list scan.
    """
    seen_urls = set()
    selected = []
    for item in arr:
        if item['url'] not in seen_urls:
            seen_urls.add(item['url'])
            selected.append(item)
    return selected


print(drop_duplicates(arr))