Commit 97acb5f3 authored by Michael Giansiracusa's avatar Michael Giansiracusa
Browse files

Lots of modifications. The one I just made is to deal with null values in the...

Lots of modifications. The one I just made is to deal with null values in the time_offset variable, and by deal with... I mean skip that file.
parent efe6eb96
......@@ -3,6 +3,7 @@
"""
Author: Michael Giansiracusa
Email: giansiracumt@ornl.gov
Version: 1.1.0
Purpose:
    This module processes NetCDF files into ascii csv format.
......@@ -50,9 +51,9 @@ from glob import glob
from logging.handlers import RotatingFileHandler
# The following would override the logging configuration in netcdf2ascii, uncomment to test just this script.
# logging.basicConfig(level=logging.DEBUG,
# format='%(asctime)s %(levelname)s (%(funcName)s %(lineno)s) : %(message)s',
# datefmt='%Y-%m-%d %H:%M')
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(levelname)s (%(funcName)s %(lineno)s) : %(message)s',
datefmt='%Y-%m-%d %H:%M')
nc2csv_logger = logging.getLogger("nc2csv")
# This was needed to import the netCDF4 library.
site_packages = "/apps/base/python3.5/lib/python3.5/site-packages"
......@@ -164,10 +165,10 @@ def nc2csv(args: argparse.Namespace) -> int :
# this is done here to make sure that the date_time variable is in the varlist for indexing and dqr filtering.
args.__setattr__('variables', format_variables(args.variables))
nc2csv_logger.info('Starting process pool.')
process_pool(args)
# process_pool(args)
# For debugging, multiprocessing will mask some errors.
# for file_name in args.files:
# process_one_file(args.variables, args.datastream, args.out_dir, args.DQRfilter, args.dqr_timeblocks, file_name)
for file_name in args.files:
process_one_file(args.variables, args.datastream, args.out_dir, args.DQRfilter, args.dqr_timeblocks, file_name)
nc2csv_logger.info('Dumping headers for {}'.format(args.datastream))
dump_header(args)
......@@ -204,7 +205,7 @@ def parse_varlist(var_list: str) -> list:
    :param var_list (str): The var_list input argument.
:return str: The path to the file containing the variables to extract or 'all' to denote extracting all variables
"""
if os.path.exists(var_list):
if var_list and os.path.exists(var_list):
if os.stat(var_list).st_size != 0:
return var_list
else:
......@@ -428,10 +429,25 @@ def process_one_file(variables: list, datastream: str, out_dir: str,
# create datetime column
if var == "date_time":
# FIXME This is the old way, much slower. the num2date can throw an index out of bounds.
# temp_datetimes = []
# for val in rootgrp["time_offset"][:]:
# temp_timestamp = np.asscalar(val + rootgrp["base_time"][0])
# temp_datetime = datetime.utcfromtimestamp(temp_timestamp).strftime("%Y-%m-%d %H:%M:%S")
# temp_datetimes.append(temp_datetime)
# df["date_time"] = temp_datetimes
# df.set_index("date_time", inplace=True)
# print("process_one_file: created date_time column")
# FIXME this is the better way but for some reason num2
# Vector opp to create pandas.__libs.tslibs.timestamps.Timestamp objects out of basetime and time offset
                        # This is possible without referencing base_time explicitly because the units attribute of the netCDF4 time_offset variable encodes the epoch.
df["date_time"] = netCDF4.num2date(rootgrp.variables['time_offset'][:],
rootgrp.variables['time_offset'].units)
try:
df["date_time"] = netCDF4.num2date(rootgrp.variables['time_offset'][:],
rootgrp.variables['time_offset'].units,
calendar='standard')
except OverflowError as ofe:
nc2csv_logger.warning("{}: {} skipped.".format(ofe, file_name))
return None
nc2csv_logger.debug("created date_time column")
# Set the new column as the index so we can do date based slicing for dqr filtering later.
df.set_index("date_time", inplace=True)
......@@ -459,12 +475,18 @@ def process_one_file(variables: list, datastream: str, out_dir: str,
df[var] = rootgrp.variables[var][:]
elif dim_len == 2 and dim[0] == "time":
# if length of the dimensions is 2 then it's a 2d var with time
msg = "{} is 2 dimensional. Dims: {}".format(var, dim)
msg = "{} is 2 dimensional and will not be converted. Dims: {}".format(var, dim)
nc2csv_logger.warning(msg)
for dim_number in range(rootgrp.variables[var].get_dims()[1].size):
var_heading = "{}_{}".format(var, dim_number)
data = [d[dim_number] for d in rootgrp.variables[var][:].data]
df[var_heading] = data
# This would convert 2 dimensional variables into multiple columns but it was agreed we should not support this feature.
# This program needs to work on python version 3.5.0 which does not support multi-dimensional list comprehensions in ln 470.
# try:
# for dim_number in range(len(rootgrp.variables['concentration'].dimensions[1])):
# var_heading = "{}_{}".format(var, dim_number)
# nc2csv_logger.debug("var_heading: {}".format(var_heading))
# data = [d[dim_number] for d in rootgrp.variables[var][:].data]
# df[var_heading] = data
# except IndexError:
# pass
elif dim_len > 2:
# if length of dimensions > 2 then... I've not seen this yet - May 4th 2017
msg = "WARNING: {} in {} > 2 dimensional and will not be converted. Dims: {}".format(var, os.path.basename(file_name), dim)
......@@ -582,6 +604,7 @@ def merge_output(args: argparse.Namespace):
total_size = 0
max_size = 2147483648 # ~2.15 Gb
nc2csv_logger.debug('looping through tmp files: {}'.format(tmpfiles))
new_start_index = len(tmpfiles)-1
for i, f in enumerate(tmpfiles):
if os.path.getsize(f) > max_size:
nc2csv_logger.warning('No merging possible. Files to large.')
......@@ -597,7 +620,10 @@ def merge_output(args: argparse.Namespace):
for f in tmpfiles[:new_start_index]:
nc2csv_logger.debug('Merging {}'.format(f))
dataframes = csv_2_df_list(tmpfiles[:new_start_index])
concat_frames = pandas.concat(dataframes, sort=True)
concat_frames = pandas.concat(dataframes)
# This program needs to work on python version 3.5.0 in
# which the compatible version of pandas does not support the "sort" keyword
# concat_frames = pandas.concat(dataframes, sort=True)
outfile_path = create_output_path(args.out_dir, tmpfiles[:new_start_index])
nc2csv_logger.info("Output file --> {}".format(outfile_path))
......
......@@ -568,7 +568,7 @@ def get_launch_information(filename, in_dir):
try:
file_list = [f for f in os.listdir(in_dir)]
except FileNotFoundError as fnfe: #todo remove?
traceback.print_tb(fnfe, 2)
print("file not found when tryig to list contents of in_dir: {}".format(in_dir))
# In this case, the file we are splitting is assumed to be the normal ADI netCDF filename format,
# e.g. sgpaafccn200F1.a1.20160411.20582.nc
......@@ -722,6 +722,8 @@ def main(ncfile, vars, in_dir, out_dir):
# Read in the netCDF file that is to be converted.
datagrp = Dataset(ncfile, "r")
if 'all' in vars:
vars = datagrp.variables.keys()
dimensions, datalength = get_dimensions(datagrp)
......@@ -829,13 +831,15 @@ def nc2icartt(main_args):
files = [os.path.join(main_args.in_dir, x) for x in files]
print('opening {}'.format(main_args.var_list))
try:
if main_args.var_list:
with open(main_args.var_list) as open_file:
print('reading variables')
variables = open_file.readlines()
except [TypeError, FileNotFoundError]:
print("nc2icartt: File error - var_list = {}".format(main_args.var_list))
return(1)
else:
# todo make this default to all variables if no file found
variables = ['all']
# print("nc2icartt: File error - var_list = {}".format(main_args.var_list))
# return(1)
print('striping variables')
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment