Commits (6)
......@@ -42,6 +42,7 @@ Output:
import os
import sys
import logging
import numpy
import pandas
import requests
import multiprocessing
......@@ -49,9 +50,10 @@ from functools import partial
from datetime import datetime
from glob import glob
from logging.handlers import RotatingFileHandler
from dateutil.parser import parse
# The following would override the logging configuration in netcdf2ascii, uncomment to test just this script.
logging.basicConfig(level=logging.DEBUG,
logging.basicConfig(level=logging.WARNING,
format='%(asctime)s %(levelname)s (%(funcName)s %(lineno)s) : %(message)s',
datefmt='%Y-%m-%d %H:%M')
nc2csv_logger = logging.getLogger("nc2csv")
......@@ -66,7 +68,6 @@ except (ImportError, ModuleNotFoundError) as e:
import argparse
help_description = """
This program converts netcdf files into csv files.
"""
......@@ -164,7 +165,7 @@ def nc2csv(args: argparse.Namespace) -> int :
# this is done here to make sure that the date_time variable is in the varlist for indexing and dqr filtering.
args.__setattr__('variables', format_variables(args.variables))
nc2csv_logger.info('Starting process pool.')
# nc2csv_logger.info('Starting process pool.')
# process_pool(args)
# Files are processed one at a time here for debugging, because multiprocessing will mask some errors.
for file_name in args.files:
......@@ -429,26 +430,33 @@ def process_one_file(variables: list, datastream: str, out_dir: str,
# create datetime column
if var == "date_time":
# FIXME This is the old way, much slower. the num2date can throw an index out of bounds.
# temp_datetimes = []
# for val in rootgrp["time_offset"][:]:
# temp_timestamp = np.asscalar(val + rootgrp["base_time"][0])
# temp_datetime = datetime.utcfromtimestamp(temp_timestamp).strftime("%Y-%m-%d %H:%M:%S")
# temp_datetimes.append(temp_datetime)
# df["date_time"] = temp_datetimes
# df.set_index("date_time", inplace=True)
# print("process_one_file: created date_time column")
# FIXME this is the better way, but for some reason num2date can throw an index out of bounds.
# Vectorized operation to create pandas._libs.tslibs.timestamps.Timestamp objects out of the base time and time offset.
# This is possible without referencing the base time explicitly because it is carried by the netCDF4 variable object itself (its units are passed to num2date).
try:
df["date_time"] = netCDF4.num2date(rootgrp.variables['time_offset'][:],
rootgrp.variables['time_offset'].units,
calendar='standard')
rootgrp.variables['time_offset'].units)
except OverflowError as ofe:
nc2csv_logger.warning("{}: {} skipped.".format(ofe, file_name))
return None
except (TypeError, IndexError, KeyError):
try:
df["date_time"] = netCDF4.num2date(rootgrp.variables['time'][:],
rootgrp.variables['time'].units)
except Exception:
try:
# TODO This is the old way, much slower but necessary because the num2date can throw an index out of bounds for an unknown reason.
temp_datetimes = []
for val in rootgrp["time_offset"][:]:
temp_timestamp = numpy.asscalar(val + rootgrp["base_time"][0])
temp_datetime = datetime.utcfromtimestamp(temp_timestamp).strftime("%Y-%m-%d %H:%M:%S")
temp_datetimes.append(temp_datetime)
df["date_time"] = temp_datetimes
nc2csv_logger.debug("Multiple errors while creating datetime column.")
except Exception as e:
nc2csv_logger.warning("{}: {} skipped.".format(e, file_name))
return None
nc2csv_logger.debug("created date_time column")
print(df.head())
# Set the new column as the index so we can do date based slicing for dqr filtering later.
df.set_index("date_time", inplace=True)
nc2csv_logger.debug("Set date_time column as index.")
......@@ -467,12 +475,20 @@ def process_one_file(variables: list, datastream: str, out_dir: str,
# Create a list of the correct length and write the list to the
# pandas.DataFrame under a column named after the current var.
nc2csv_logger.debug("{} is constant".format(var))
df[var] = [rootgrp.variables[var][0]] * len(rootgrp.variables["time_offset"])
try:
df[var] = [rootgrp.variables[var][0]] * len(rootgrp.variables["time_offset"])
except TypeError as te:
nc2csv_logger.warning("{}:{} no length, probably a descriptive variable, skipping.".format(te,var))
except ValueError as ve:
nc2csv_logger.warning("{}:{} - skipping".format(ve, var))
# if the len of dimension is 1 then it's a time series
elif dim_len == 1 and dim[0] == "time":
# Write the values to the pandas.DataFrame under a column named after the current var.
nc2csv_logger.debug("{} is 1 dimension".format(var))
df[var] = rootgrp.variables[var][:]
try:
df[var] = rootgrp.variables[var][:]
except ValueError as ve:
nc2csv_logger.warning("{}:{} - skipping".format(ve, var))
elif dim_len == 2 and dim[0] == "time":
# if length of the dimensions is 2 then it's a 2d var with time
msg = "{} is 2 dimensional and will not be converted. Dims: {}".format(var, dim)
......@@ -529,8 +545,11 @@ def process_one_file(variables: list, datastream: str, out_dir: str,
nc2csv_logger.debug("Finished dqr filtering for {}".format(file_name))
# Get the first valid index, which is a Timestamp object, and format it.
start_time = df.first_valid_index().__format__("%Y%m%d.%H%M%S")
nc2csv_logger.debug('first_valid_index = {}({})'.format(type(start_time), start_time))
nc2csv_logger.debug('first_valid_index = {}({})'.format(type(df.first_valid_index()), df.first_valid_index()))
try:
start_time = df.first_valid_index().__format__("%Y%m%d.%H%M%S")
except ValueError:
start_time = parse(df.first_valid_index()).strftime('%Y%m%d.%H%M%S')
# Build the output file name from the datastream name and the formatted start time, joined on periods, and add the new ending.
outbase = "{}.{}.custom.csv".format(datastream, start_time)
......