import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import pandas as pd
import numpy as np
import os
import re
import json
import shutil
# Run from the project root so the relative "./data/..." paths below resolve.
working_dir = 'Path/To/Your/working/directory'
os.chdir(working_dir)

# Load the JSON configuration file.
with open("./data/config.json", "r") as config_file:
    config = json.load(config_file)

# Unpack the global parameters; the names on the left line up one-to-one,
# in order, with the configuration keys on the right.
(
    nasdaq_data_filename,
    download_folder,
    output_folder,
    landing_base,
    chromedriver_path,
    xpath_X,
    xpath_5Y,
    xpath_download,
    max_waiting_time,
) = (
    config[key]
    for key in (
        "nasdaq_data_filename",
        "download_folder",
        "output_folder",
        "landing_base",
        "chromedriver_path",
        "xpath_X",
        "xpath_5Y",
        "xpath_download",
        "max_waiting_time",
    )
)

# Read the Nasdaq ticker symbols.
# The CSV is downloaded from: https://www.nasdaq.com/market-activity/stocks/screener
nasdaq_data = pd.read_csv('./data/' + nasdaq_data_filename)
symbol_column = nasdaq_data["Symbol"]
nasdaq_symbols = symbol_column.tolist()
# Utility function to launch chrome web driver
def launchChromeDriver(chromedriver = chromedriver_path):
    """Start a new Chrome session and bind it to the module-level ``driver``.

    Args:
        chromedriver: Path to the chromedriver executable. Defaults to the
            ``chromedriver_path`` value read from the configuration file.

    Returns:
        The newly created ``webdriver.Chrome`` instance, which is also stored
        in the global ``driver`` used by the rest of the script.
    """
    global driver
    # Imported locally because only this function needs it.
    from selenium.webdriver.chrome.service import Service

    options = webdriver.ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    options.add_argument("--disable-infobars")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-browser-side-navigation")
    options.add_argument('--ignore-certificate-errors-spki-list')
    options.add_argument('--ignore-ssl-errors')

    try:
        # Selenium 4 form: the driver path travels in a Service object and the
        # options go in ``options=``. The older ``chrome_options=`` and
        # ``executable_path=`` keywords are deprecated and no longer accepted
        # by recent Selenium 4 releases.
        driver = webdriver.Chrome(service=Service(chromedriver),
                                  options=options)
    except TypeError:
        # Selenium 3: Chrome() does not know the ``service`` keyword yet.
        driver = webdriver.Chrome(executable_path=chromedriver,
                                  options=options)
    return driver
# Utility function to perform clicks
def clickAction(landing_page,
                waiting_time = 30,
                xpathX = xpath_X,
                xpath5Y = xpath_5Y,
                xpathdownload = xpath_download):
    """Open a landing page and click through to the data download.

    Uses the module-level ``driver`` created by ``launchChromeDriver``.

    Args:
        landing_page: URL of the page to open.
        waiting_time: Implicit wait, in seconds, applied to every element
            lookup. The website has banner ads that take time to load fully.
        xpathX: XPath of the "X" button that closes the pop-up window.
        xpath5Y: XPath of the "5Y" button (last five years of data).
        xpathdownload: XPath of the "Download" button.

    Raises:
        Any Selenium exception raised while loading the page, locating an
        element or clicking it is propagated to the caller.
    """
    driver.get(landing_page)
    driver.maximize_window()
    driver.implicitly_wait(waiting_time)

    # Click, in order: close the pop-up, select "5Y", then "Download".
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which was
    # removed in Selenium 4.3; this form works on Selenium 3 and 4 alike.
    for xpath in (xpathX, xpath5Y, xpathdownload):
        driver.find_element(By.XPATH, xpath).click()
# Lower-cased ticker symbols that the main loop iterates over to download
# the time-series stock data.
# Missing values are dropped: with its default settings pandas parses some
# literal symbols (for example "NA") as NaN, and a float NaN in this list
# would crash the string handling (``ticker.upper()``) in the main loop.
nasdaq_tickers = nasdaq_data["Symbol"].str.lower().dropna().tolist()
# Main download loop

# File names that count as a finished download: they start with
# "HistoricalData" and end in ".csv".
DOWNLOAD_NAME_PATTERN = re.compile(r"^HistoricalData.*\.csv$")


# Utility function to list downloaded history files
def listDownloadedFiles(folder):
    """Return the file names in *folder* that match DOWNLOAD_NAME_PATTERN."""
    return [name for name in os.listdir(folder)
            if DOWNLOAD_NAME_PATTERN.match(name)]


# Utility function to wait for a download to show up
def waitForDownload(folder, timeout):
    """Poll *folder* until a history file appears or *timeout* seconds pass.

    Returns:
        The matching file names; an empty list when the timeout expired
        without any file appearing.
    """
    deadline = time.monotonic() + timeout
    while True:
        found = listDownloadedFiles(folder)
        if found or time.monotonic() >= deadline:
            return found
        print("Still waiting file to download...")
        # Sleep between polls instead of spinning on os.listdir.
        time.sleep(1)


# Utility function to download and store the data of one ticker
def downloadTicker(ticker):
    """Download the history CSV for *ticker* and store it under its symbol.

    Expects a Chrome session in the global ``driver``. The session is
    restarted after every failed attempt, with the waiting time increased
    by 30 seconds, until ``max_waiting_time`` is exceeded.

    Returns:
        True when the file was saved, False when every attempt failed.
    """
    # Remove files left over from earlier attempts so that whatever shows up
    # afterwards belongs to this ticker.
    leftovers = listDownloadedFiles(download_folder)
    for item in leftovers:
        os.unlink(os.path.join(download_folder, item))
    if leftovers:
        print("Cleaned existing file...")

    # Attempt to download the new file.
    waitingtime = 30
    while True:
        try:
            clickAction(landing_page = landing_base + ticker + "/historical",
                        waiting_time = waitingtime)
        except Exception as exc:
            print("clickAction raised: " + repr(exc))
            # The download may have started before the failure, so check
            # the folder once before giving up on this attempt.
            found = listDownloadedFiles(download_folder)
        else:
            print("Trying to download the file...")
            # Give the browser time to finish writing the file rather than
            # clicking again straight away.
            found = waitForDownload(download_folder, waitingtime)
        if found:
            print(found)
            break

        print("Unsucessful clickAction")
        driver.quit()
        launchChromeDriver()
        waitingtime += 30
        print("Still trying to find the target to click...increasing waiting time to: " + str(waitingtime))
        if waitingtime > max_waiting_time:
            return False

    # Copy the file into the storage location under its ticker symbol, then
    # delete it from the downloads folder.
    src = os.path.join(download_folder, found[0])
    target = "./data/historical_data/" + ticker.upper() + ".csv"
    shutil.copyfile(src, target)
    os.unlink(src)
    return True


if __name__ == "__main__":
    for ticker in nasdaq_tickers:
        print("*" * 75)
        if not isinstance(ticker, str):
            # A missing symbol (NaN) cannot be turned into a URL.
            print('Skipping NASDAQ ticker: ' + str(ticker))
            continue
        print('Processing NASDAQ ticker: '+ ticker.upper())

        launchChromeDriver()
        try:
            saved = downloadTicker(ticker)
        except Exception as exc:
            # Keep going with the next ticker, but report what went wrong.
            print("Executed 4")
            print("Error while processing " + ticker.upper() + ": " + repr(exc))
            saved = False
        finally:
            # Always close the browser, including for skipped tickers, so
            # Chrome processes do not pile up over the run.
            driver.quit()

        if saved:
            print('Processed NASDAQ ticker: '+ ticker.upper())
        else:
            print('Skipping NASDAQ ticker: '+ ticker.upper())