Source code for neonwranglerpy.utilities.stackdatafiles

"""Stack the data files according to table_types."""
import os.path
import pandas as pd
from neonwranglerpy import get_data
import neonwranglerpy.utilities.tools as tl
import neonwranglerpy.utilities.utils as ut


[docs]def load_table_types(dpID: str): """Return the dataframe about the table types of Data Products.""" stream = get_data('table_types.csv') df = pd.read_csv(stream) table_types = df[df['productID'] == dpID] table = table_types.reset_index(drop=True) return table
[docs]def stackdatafiles(folder_path, dst, dpID, stack_df=False): """Stack the data files according to table_types.""" if not os.path.exists(folder_path): print(f"{folder_path} does not exists") return None filenames = tl.get_all_files(folder_path) filepaths = tl.get_all_files(folder_path, dir_name=True) variables_list = [_ for _ in filepaths if "variables.20" in _] validation_list = [s for s in filepaths if "validation" in s] codes_list = [s for s in filepaths if "categoricalCodes" in s] stackedpath = os.path.join(dst, 'stackedFiles') # getting the table types from table_types.csv table_types = load_table_types(dpID) # getting the table types from files a_names = set([s.split('.')[6] for s in filenames]) t_names = [s for s in a_names if '_' in s] t_filter = table_types['tableName'].isin(t_names) tables = table_types[t_filter] table_names = tables.tableName # copy varibles and validation files to /stackedfiles using the most # recent publication date if variables_list: # get most recent variable file varpath = ut.get_recent_publications(variables_list) # get variables from the files variables = ut.get_variables(varpath) if not os.path.exists(stackedpath): print('creating stackedFiles directory') os.makedirs(stackedpath) if validation_list: valpath = ut.get_recent_publications(validation_list) validation_dst = os.path.join(stackedpath, f"validation_{dpID}.csv") tl.copy_zip(valpath, validation_dst) print("copying the most recent publication of validation file to /stackedFiles") if codes_list: codepath = ut.get_recent_publications(codes_list) code_dst = os.path.join(stackedpath, f"categoricalCodes_{dpID}.csv") tl.copy_zip(codepath, code_dst) print("copying the most recent publication of categoricalCodes file to" "/stackedFiles") out = {} # stacking the files for i in range(len(table_names)): file_list = sorted([file for file in filepaths if table_names[i] in file]) temp_files = [] stacking_list = [] if tables.tableType[i] == "site-date": temp_files = file_list if tables.tableType[i] == "site-all": base_files = [os.path.basename(name) for name in file_list] sites = set([s.split(".")[2] for s in base_files]) for _ in sites: site_list = [s for s in file_list if _ in s] site = ut.get_recent_publications(site_list) temp_files.append(site) for _ in temp_files: df = pd.read_csv(_) stacking_list.append(df) stacked_df = pd.concat(stacking_list, axis=0) df_save_path = os.path.join(stackedpath, f"{table_names[i]}_{dpID}.csv") if stack_df: out[table_names[i]] = stacked_df stacked_df.to_csv(df_save_path, index=False) out['variables'] = variables out['stackedpath'] = stackedpath return out