# Source code for taq_data_analysis_statistics

'''TAQ data analysis module.

The functions in the module analyze the statistics from the NASDAQ stock
market.

This script requires the following modules:
    * itertools
    * multiprocessing
    * numpy
    * pandas
    * taq_data_tools_statistics

The module contains the following functions:
    * taq_quotes_trades_day_statistics_data - statistics of quotes and trades
      for a day.
    * taq_quotes_trades_year_statistics_data - statistics of quotes and trades
      for a year.
    * taq_midpoint_day_statistics_data - statistics of the midpoint price for
      a day.
    * taq_midpoint_year_statistics_data - statistics of the midpoint price for
      a year.
    * main - the main function of the script.

.. moduleauthor:: Juan Camilo Henao Londono <www.github.com/juanhenao21>
'''

# ----------------------------------------------------------------------------
# Modules

from itertools import product as iprod
import multiprocessing as mp
import numpy as np
import pandas as pd

import taq_data_tools_statistics

# ----------------------------------------------------------------------------


def taq_quotes_trades_day_statistics_data(ticker, date):
    """Obtain the quotes and trades statistics for a day.

    Using the quotes and trades files, obtain the number of quotes, the
    number of trades and the average spread for a day.

    :param ticker: string of the abbreviation of the stock to be analyzed
     (i.e. 'AAPL').
    :param date: string with the date of the data to be extracted
     (i.e. '2008-01-02').
    :return: tuple -- (number of quotes, number of trades, average spread).
     If one of the data files does not exist, a message is printed and a
     tuple with three NaN values is returned.
    """

    # Only the year is needed to locate the folder with the daily files
    year = date.split('-')[0]

    # Keep the try body limited to the calls that can raise FileNotFoundError
    try:
        # Load data
        data_quotes = pd.read_hdf(
            f'../../taq_data/hdf5_daily_data_{year}/taq'
            + f'_{ticker}_quotes_{date}.h5',
            key='/quotes', columns=['Bid', 'Ask'])
        data_trades = pd.read_hdf(
            f'../../taq_data/hdf5_daily_data_{year}/taq'
            + f'_{ticker}_trades_{date}.h5',
            key='/trades', columns=['Ask'])

    except FileNotFoundError as e:
        print('No data')
        print(e)
        print()
        # np.nan is the supported spelling (the np.NaN alias was removed in
        # NumPy 2.0)
        return (np.nan, np.nan, np.nan)

    # Some files are corrupted, so there are some zero values that do not
    # make sense. Those rows are discarded
    data_quotes = data_quotes[data_quotes['Ask'] != 0]
    data_trades = data_trades[data_trades['Ask'] != 0]

    # NOTE(review): the division by 10000 presumably converts the raw prices
    # to dollar units -- confirm against the data source
    spread = (data_quotes['Ask'] - data_quotes['Bid']) / 10000

    num_quotes = len(data_quotes)
    num_trades = len(data_trades)
    avg_spread = np.mean(spread)

    return (num_quotes, num_trades, avg_spread)
# ----------------------------------------------------------------------------
def taq_quotes_trades_year_statistics_data(tickers, year):
    """Obtain the quotes and trades statistics for a year.

    Using the taq_quotes_trades_day_statistics_data function computes the
    statistics of the average spread, number of quotes and number of trades
    for a year.

    :param tickers: list of the string abbreviation of the stocks to be
     analyzed (i.e. ['AAPL', 'MSFT']).
    :param year: string of the year to be analyzed (i.e '2016').
    :return: None -- The function appends the data to the file
     '../taq_quotes_trades_year_statistics_data.csv' and does not return a
     value.
    """

    function_name = taq_quotes_trades_year_statistics_data.__name__

    # The file is opened in append mode, so a header line is added on every
    # call. The context manager guarantees the file is flushed and closed,
    # even if the computation for a ticker fails
    with open('../taq_quotes_trades_year_statistics_data.csv', 'a+') as file:
        file.write('Ticker, avg_quotes, avg_trades, avg_spread\n')

        for ticker in tickers:

            taq_data_tools_statistics.taq_function_header_print_data(
                function_name, ticker, ticker, year, '', '')

            dates = taq_data_tools_statistics.taq_bussiness_days(year)
            args_prod = iprod([ticker], dates)

            # Parallel computation of the statistics. The result is a list
            # with one (num_quotes, num_trades, avg_spread) tuple per date
            with mp.Pool(processes=mp.cpu_count()) as pool:
                stat = pool.starmap(taq_quotes_trades_day_statistics_data,
                                    args_prod)

            # To obtain the average of the year, all the daily results are
            # averaged column by column. The NaN values of the days without
            # data are ignored
            stat_year = np.nanmean(stat, axis=0)

            # Write data in file
            file.write(f'{ticker}, {stat_year[0]:.0f}, {stat_year[1]:.0f},'
                       + f' {stat_year[2]:.2f}\n')

    return None
# ----------------------------------------------------------------------------
def taq_midpoint_day_statistics_data(ticker, date):
    """Obtain the midpoint price statistics for a day.

    Using the quotes files, obtain the midpoint price and the relative
    difference between the last midpoint price of every time stamp and the
    average midpoint price of that time stamp. The relative differences are
    averaged over all the time stamps of the day.

    :param ticker: string of the abbreviation of the stock to be analyzed
     (i.e. 'AAPL').
    :param date: string with the date of the data to be extracted
     (i.e. '2008-01-02').
    :return: float -- average relative difference for the day. NaN is
     returned if the quotes file does not exist (a message is printed) or if
     there are no valid quotes in the file.
    """

    # Only the year is needed to locate the folder with the daily files
    year = date.split('-')[0]

    # Keep the try body limited to the call that can raise FileNotFoundError
    try:
        # Load data
        data_quotes = pd.read_hdf(
            f'../../taq_data/hdf5_daily_data_{year}/taq'
            + f'_{ticker}_quotes_{date}.h5',
            key='/quotes', columns=['Time', 'Bid', 'Ask'])

    except FileNotFoundError as e:
        print('No data')
        print(e)
        print()
        # np.nan is the supported spelling (the np.NaN alias was removed in
        # NumPy 2.0)
        return np.nan

    # Some files are corrupted, so there are some zero values that do not
    # make sense. Those rows are discarded
    data_quotes = data_quotes[data_quotes['Ask'] != 0]

    # Without valid quotes there is nothing to average. NaN is returned so
    # the day is ignored by a NaN-aware average, instead of dividing by zero
    if data_quotes.empty:
        return np.nan

    time = data_quotes['Time']
    # NOTE(review): the division by 10000 presumably converts the raw prices
    # to dollar units -- confirm against the data source
    midpoint = ((data_quotes['Ask'] + data_quotes['Bid']) / 2) / 10000

    # Average midpoint price of every time stamp, aligned with every row.
    # A single group-by replaces one full scan of the data per time stamp
    midpoint_mean = midpoint.groupby(time).transform('mean')

    # Rows that hold the last quote of every time stamp. The selection is
    # by position, so it does not depend on the index labels of the data
    last_of_time = ~time.duplicated(keep='last')

    relative_diff = (np.abs(midpoint_mean - midpoint)
                     / midpoint_mean)[last_of_time]

    # One value per distinct time stamp. NaN values are not skipped, so they
    # propagate to the result
    midpoint_error = relative_diff.sum(skipna=False) / len(relative_diff)

    return midpoint_error
# ----------------------------------------------------------------------------
def taq_midpoint_year_statistics_data(tickers, year):
    """Obtain the midpoint price statistics for a year.

    Using the taq_midpoint_day_statistics_data function computes the
    statistics of the last midpoint price in a second and the average
    midpoint of the second for a year.

    :param tickers: list of the string abbreviation of the stocks to be
     analyzed (i.e. ['AAPL', 'MSFT']).
    :param year: string of the year to be analyzed (i.e '2016').
    :return: None -- The function appends the data to the file
     '../taq_midpoint_year_statistics_data.csv' and does not return a value.
    """

    # Name of this function, used in the header printed for every ticker
    function_name = taq_midpoint_year_statistics_data.__name__

    # The file is opened in append mode, so a header line is added on every
    # call. The context manager guarantees the file is flushed and closed,
    # even if the computation for a ticker fails
    with open('../taq_midpoint_year_statistics_data.csv', 'a+') as file:
        file.write('Ticker, Difference\n')

        for ticker in tickers:

            taq_data_tools_statistics.taq_function_header_print_data(
                function_name, ticker, ticker, year, '', '')

            dates = taq_data_tools_statistics.taq_bussiness_days(year)
            args_prod = iprod([ticker], dates)

            # Parallel computation of the statistics. The result is a list
            # with one value per date
            with mp.Pool(processes=mp.cpu_count()) as pool:
                stat = pool.starmap(taq_midpoint_day_statistics_data,
                                    args_prod)

            # To obtain the average of the year, all the daily results are
            # averaged. The NaN values of the days without data are ignored
            stat_year = np.nanmean(stat, axis=0)

            # Write data in file
            file.write(f'{ticker}, {stat_year}\n')

    return None
# ----------------------------------------------------------------------------
def main():
    """The main function of the script.

    The main function is used to test the functions in the script.

    :return: None.
    """

    return None
# ----------------------------------------------------------------------------

# Run the main function only when the module is executed as a script
if __name__ == "__main__":
    main()