# nc2icartt.py -- convert ARM Aerial Facility (AAF) netCDF data files to the
# ICARTT (FFI 1001) text format.
import copy
import json
import math
import os
import sys
import argparse
import urllib.request, urllib.error, urllib.parse

sys.path.insert(0, "/apps/base/python3.5/lib/python3.5/site-packages")

import pandas as pd
import numpy as np

from collections import OrderedDict
from datetime import datetime
from datetime import timedelta
from netCDF4 import Dataset


# Mapping of the 'facility_id' to the corresponding platform
PLATFORM_MAP = {
    "F1": "Department of Energy ARM Aerial Facility Gulfstream",
    "F2": "ArcticShark UAS",
    "F3": "Datahawk UAS"
}

# Map the SOLR API's IOP long names to their respective ICARTT descriptions.
# Need to keep in mind that for campaigns that cross DST, need to include information for both when DST is active and
# when it is not in effect (Start time, UTC offset, end time, UTC offset).
CAMPAIGN_LOCATION_MAP = {
    "Holistic Interactions of Shallow Clouds, Aerosols, and Land-Ecosystems (HI-SCALE)":
        "Flights based out of Bartlesville, OK.",
    "Aerosol and Cloud Experiments in the Eastern North Atlantic (ACE-ENA)":
        "Flights based out of Lajes Portugal in the Azores. Measurements focused on DOE ARM ENA site."
}

# Template for the ICARTT (FFI 1001) header fields, kept in the order the
# fields must appear in the output file.  Values are filled in per input file
# by trans_header() and its helpers.
DEFAULT_HEADER = OrderedDict([
            ("Numberlines", 0),
            ("Name", ""),
            ("Organization", ""),
            ("Data_source_description", ""),
            ("Mission_name", ""),
            ("Filenum_Numfile", (1, 1)),
            ("Databegin_rev", ""),
            ("Data_interval", 0),
            ("Independent_Variable", ""),
            ("NumOfVar", 0),
            ("Scale", 1),
            ("Missing_data", -99999),
            ("Variables", OrderedDict()),
            ("NumberOfSpecialComments", 0),
            ("Special_comments", ""),
            ("NumberOfNormalComments", 18),
            ("Normal_comments", "")])

# Template for the ICARTT "normal comments" block, keyed by comment tag and
# kept in the order the tags must appear in the file.  Filled in per file by
# get_normal_comments().
DEFAULT_NORMAL_COMMENTS = OrderedDict([
                    ("PI_CONTACT_INFO", ""),
                    ("PLATFORM", ""),
                    ("LOCATION", ""),
                    ("ASSOCIATED_DATA", ""),
                    ("INSTRUMENT_INFO", ""),
                    ("DATA_INFO", ""),
                    ("UNCERTAINTY", ""),
                    ("ULOD_FLAG", ""),
                    ("ULOD_VALUE", ""),
                    ("LLOD_FLAG", ""),
                    ("LLOD_VALUE", ""),
                    ("DM_CONTACT_INFO", ""),
                    ("PROJECT_INFO", ""),
                    ("STIPULATIONS_ON_USE", ""),
                    ("OTHER_COMMENTS", ""),
                    ("REVISION", ""),
                    ("R#", "")])

# Revision history lines of the form "R0 : <comment>"; populated per file by
# get_normal_comment_revision().
DEFAULT_REVISION_INFO = []


def time_difference(d2, d1, timed, description, unit):
    """Return (elapsed time between d1 and d2 in *unit*, start/end date string).

    d2 and d1 are "%Y-%m-%d %H:%M:%S" strings: d2 is the day the data starts,
    d1 is the sampling epoch (e.g. 1970-01-01).  *timed* is a timedelta giving
    the data duration, used only to derive the end date.  *description* is
    currently unused but kept for interface compatibility.  *unit* selects the
    output unit: "seconds", "minutes", "hours" or "days".
    """
    fmt = "%Y-%m-%d %H:%M:%S"
    start = datetime.strptime(d2, fmt)  # the day when this data starts
    base = datetime.strptime(d1, fmt)   # the day when the sampling started
    end = start + timed
    date_fields = [str(start.year), str(start.month).zfill(2), str(start.day).zfill(2),
                   str(end.year), str(end.month).zfill(2), str(end.day).zfill(2)]
    start_end = ",".join(date_fields)

    elapsed_seconds = (start - base).total_seconds()
    seconds_per_unit = {"seconds": 1, "minutes": 60, "hours": 3600, "days": 86400}
    return elapsed_seconds / seconds_per_unit[unit], start_end


def get_basetime(datagroup):
    """Derive (elapsed time since epoch, 'start,end' date string) from base_time.

    Estimates the record duration as data-interval * sample-count, then
    delegates to time_difference() for the actual computation.

    NOTE(review): assumes base_time has a 'string' attribute beginning with
    'Y-m-d H:M:S' and a units string like 'seconds since 1970-1-1 0:00:00' --
    confirm against the ARM netCDF convention.
    """
    variable = datagroup.variables["base_time"]  # base_time is supposed to be in variables.
    dim, datalength = get_dimensions(datagroup)
    di, diunit, uniform = get_data_interval(datagroup)

    # Total record duration: interval * number of samples, in interval units.
    kwargs = {diunit: di * datalength}
    timed = timedelta(**kwargs)

    startstr = variable.string.split()
    basestr = variable.units.split()
    unit = basestr[0]  # e.g. 'seconds'
    description = " ".join(basestr[1:4])  # e.g. 'since 1970-1-1 0:00:00'
    start = " ".join(startstr[0:2])  # data start: 'date time'
    base = " ".join(basestr[2:4])  # sampling epoch: 'date time'

    return time_difference(start, base, timed, description, unit)


def get_lines(header):
    """Render a header dictionary as newline-joined values, ready to write out.

    An empty 'Special_comments' entry is dropped entirely so it does not
    produce a spurious blank line in the ICARTT header.  The input dictionary
    is not modified.
    """
    trimmed = copy.deepcopy(header)
    if trimmed.get('Special_comments') == '':  # no special comments
        trimmed.pop('Special_comments', 0)
    return "\n".join(str(value) for value in trimmed.values())


def get_dimensions(datagroup):
    """Return (dimensions mapping, length of the first dimension).

    The reported length comes from the first dimension only; it is 0 when the
    datagroup has no dimensions at all.
    """
    dims = datagroup.dimensions
    first_name = next(iter(dims), None)
    datalength = len(dims[first_name]) if first_name is not None else 0
    return dims, datalength


def get_variables(datagroup):
    """Return the datagroup's mapping of variable names to variable objects."""
    return datagroup.variables


def get_global_attr(datagroup):
    """Return the list of global attribute names for the datagroup.

    Fixed: the netCDF4 API method is ``ncattrs()`` (as used everywhere else
    in this module); the previous call to the non-existent ``nvattrs()``
    raised AttributeError on every invocation.
    """
    return datagroup.ncattrs()


def get_normal_comment_platform(datagroup):
    """Map the datagroup's facility_id to its platform name.

    Returns the empty string when the facility_id is not a known platform.
    """
    # TODO: Get Confirmation
    return PLATFORM_MAP.get(datagroup.facility_id, "")


def get_normal_comment_location(datagroup, header):
    """Build the LOCATION normal-comment string.

    Looks up the header's 'Mission_name' in CAMPAIGN_LOCATION_MAP for a
    human-readable flight location, falling back to the netCDF attribute
    'location_description' when the mission is unknown.  Appends an
    "ARM site coordinates: lat x,lon y,alt z" segment for whichever of
    lat/lon/alt exist as size-1, non-masked variables, then a disclaimer
    directing users to the AAF for local time information.

    Parameters
    ----------
    datagroup: netCDF4.Dataset
        Source of the coordinate variables and location_description.
    header: dictionary
        ICARTT header; only the 'Mission_name' entry is consulted.

    Returns
    -------
    str: flight location + optional coordinate string + disclaimer, e.g.
    "Flights out of Bartlesville, OK. ARM site coordinates: lat 50.0,
    lon -50.0. Contact the AAF for local time information."
    """
    # TODO: It seems that getting lat,lon,alt from here is unreliable?
    mission_name = header.get("Mission_name")
    if mission_name in CAMPAIGN_LOCATION_MAP:
        flight_location = CAMPAIGN_LOCATION_MAP[mission_name]
    else:
        flight_location = datagroup.location_description + "."

    variables = datagroup.variables
    coords = []
    # If an entry is masked, its value is missing and therefore not reported.
    for coord_name in ("lat", "lon", "alt"):
        is_scalar = coord_name in variables and variables[coord_name].size == 1
        if is_scalar and not np.ma.is_masked(variables[coord_name][:]):
            coords.append("%s %s" % (coord_name, str(variables[coord_name][:])))

    if coords:
        coordinate_string = " ARM site coordinates: " + ",".join(coords) + "."
    else:
        coordinate_string = ""

    local_time_disclaimer = " Contact the AAF for local time information."
    return flight_location + coordinate_string + local_time_disclaimer


def get_normal_comment_associated_data(datagroup):
    """The associated-data comment is intentionally left blank for now."""
    return ""


def get_normal_comment_instrument(datagroup):
    """The instrument-info comment is intentionally left blank for now."""
    return ""


def get_normal_comment_data_info(datagroup):
    """List every constant (size-1) variable with its units.

    Each entry has the form 'var_name units'; entries are joined with ';'.
    Variables whose units are 'unitless' (or that have no 'units' attribute
    at all) are reported with 'N/A' in place of the unit.

    Fixed an operator-precedence bug: the old conditional expression bound to
    the entire ``append`` argument, so 'unitless' variables were listed as a
    bare 'N/A' with the variable name dropped.  Also tolerates variables that
    lack a 'units' attribute instead of raising AttributeError.
    """
    # put the constant variables here
    vas = datagroup.variables
    constant_var = list()
    for name in vas:
        if vas[name].size == 1:
            unit = getattr(vas[name], "units", "unitless")
            constant_var.append(name + " " + (unit if unit != "unitless" else "N/A"))
    return ';'.join(constant_var)


def get_normal_comment_uncertainty(datagroup):
    """Uncertainty is not tracked yet, but the field may not be empty.

    Returns "'N/A'" with embedded single quotes, because the ICARTT file
    checker rejects a bare N/A.
    """
    return "'N/A'"


def get_normal_comment_dm_contact_info(datagroup):
    """Data-manager contact: the generic AAF contact email address."""
    return "armaaf@arm.gov"


def get_normal_comment_project_info(datagroup):
    """The project-info comment is intentionally left blank for now."""
    return ""


def get_normal_comment_stipulations_for_use(datagroup):
    """Fixed stipulations-on-use text for all converted files."""
    return "Use of this data should be done in consultation with the PI."


def get_normal_comment_other_comments(datagroup):
    """Render every netCDF global attribute as 'name:value', joined by ' ; '.

    These are the metadata items the ICARTT format has no dedicated slot for.
    Newlines inside values are replaced with ' - ' so the rigid header line
    count is not disturbed.
    """
    rendered = []
    for attr_name in datagroup.ncattrs():
        entry = attr_name + ":" + str(getattr(datagroup, attr_name))
        rendered.append(entry.replace("\n", " - "))
    return " ; ".join(rendered)


def get_normal_comment_revision(datagroup, revision_info):
    """Prepend the current revision entry ('R0 : <comment>') to revision_info.

    The netCDF 'history' attribute is used as the revision comment when
    present; otherwise the comment defaults to 'Final Data'.  There is no
    good way to recover older revisions in the current system, so only R0 is
    generated.  Mutates *revision_info* in place; returns None.
    """
    if "history" in datagroup.ncattrs():
        comment = datagroup.history
    else:
        comment = "Final Data"
    revision_info.insert(0, "R0" + " : " + comment)


def get_normal_comments(datagroup, header, normal_comments, revision_info):
    """Build the 'normal comments' section of the ICARTT header in place.

    Fills every entry of *normal_comments* (an OrderedDict keyed by ICARTT
    comment tag), appends the revision history, and updates *header*'s
    'NumberOfNormalComments' count and rendered 'Normal_comments' text.
    """
    # set PI_CONTACT_INFO
    normal_comments["PI_CONTACT_INFO"] = "PI_CONTACT_INFO:" + " Address: PNNL ; email: armaaf@arm.gov"

    normal_comments["PLATFORM"] = "PLATFORM:" + " " + get_normal_comment_platform(datagroup)

    normal_comments["LOCATION"] = "LOCATION:" + " " + get_normal_comment_location(datagroup, header)

    normal_comments["ASSOCIATED_DATA"] = "ASSOCIATED_DATA:" + " " + get_normal_comment_associated_data(datagroup)

    normal_comments["INSTRUMENT_INFO"] = "INSTRUMENT_INFO:" + " " + get_normal_comment_instrument(datagroup)

    normal_comments["DATA_INFO"] = "DATA_INFO:" + " " + get_normal_comment_data_info(datagroup)

    normal_comments["UNCERTAINTY"] = "UNCERTAINTY:" + " " + get_normal_comment_uncertainty(datagroup)

    normal_comments["ULOD_FLAG"] = "ULOD_FLAG:" + " " + str(-7777)
    normal_comments["ULOD_VALUE"] = "ULOD_VALUE: N/A"
    normal_comments["LLOD_FLAG"] = "LLOD_FLAG:" + " " + str(-8888)
    normal_comments["LLOD_VALUE"] = "LLOD_VALUE: N/A"

    normal_comments["DM_CONTACT_INFO"] = "DM_CONTACT_INFO:" + " " + get_normal_comment_dm_contact_info(datagroup)

    normal_comments["PROJECT_INFO"] = "PROJECT_INFO:" + " " + get_normal_comment_project_info(datagroup)

    normal_comments["STIPULATIONS_ON_USE"] = "STIPULATIONS_ON_USE:" + " " + get_normal_comment_stipulations_for_use(datagroup)

    normal_comments["OTHER_COMMENTS"] = "OTHER_COMMENTS:" + " " + get_normal_comment_other_comments(datagroup)

    # Should eventually remove reliance on modifying a global dictionary.
    get_normal_comment_revision(datagroup, revision_info)
    # Fixed precedence bug: the conditional expression previously applied to
    # the whole assignment, so an empty revision_info would have produced a
    # bare "R0" line without the required "REVISION:" tag.
    normal_comments["REVISION"] = "REVISION:" + " " + (revision_info[0][0:2] if len(revision_info) > 0 else "R0")
    normal_comments['R#'] = '\n'.join(revision_info)
    normal_comments['R#'] += '\n'

    # The comment count must also cover the R# revision-history lines added
    # above, hence the extra len(revision_info).
    header["NumberOfNormalComments"] = len(normal_comments) + len(revision_info)

    header["Normal_comments"] = get_lines(normal_comments)


###################################
# functions for file header information
def get_start_end_date(datagroup):
    """Return (elapsed-time, 'start,end' date string) derived from base_time.

    Returns None when the datagroup has no 'base_time' variable.
    """
    if "base_time" not in datagroup.variables:
        return None
    return get_basetime(datagroup)


def get_data_interval(datagroup):
    """Return (interval, unit, uniform) for the datagroup's 'time' variable.

    *interval* is the spacing between the first two samples and *unit* is the
    first word of the time variable's units string.  For spacing of one unit
    or less we additionally verify the spacing is uniform across the whole
    record.  A single-sample file defaults to (1, unit, True).  Returns None
    when there is no 'time' variable at all.
    """
    if "time" not in datagroup.variables:
        return None

    time_var = datagroup.variables['time']
    unit = time_var.units.split()[0]
    # With only one data point, default to interval = 1 and uniform = True.
    if len(time_var[:]) < 2:
        return 1, unit, True

    interval = time_var[1] - time_var[0]
    uniform = True
    # For sub-second (<= 1) spacing, check uniformity across the whole file.
    if interval <= 1:
        deltas = np.diff(time_var[:])
        uniform = np.allclose(deltas, deltas[0])
    return interval, unit, uniform


def set_data_interval(datagroup):
    """Return the header's data-interval value.

    The real interval is reported only when it is uniform and no larger than
    one unit; otherwise 0 is returned, which is ICARTT's flag for a
    non-uniform (or unreliable) interval.
    """
    interval, _unit, uniform = get_data_interval(datagroup)
    return interval if (interval <= 1 and uniform) else 0


def get_independent_var(datagroup):
    """ICARTT independent-variable line: time series use 'start_time' in seconds."""
    return "start_time, seconds"


def get_scale(header):
    """Return a comma-separated scale factor of 1 for every variable."""
    count = int(header['NumOfVar'])
    return ", ".join("1" for _ in range(count))


def get_missing_data(header, missing_value=-9999):
    """Return the comma-separated missing-value line, one entry per variable.

    The default sentinel is -9999; the caller may substitute a more negative
    value later, since ICARTT requires the missing value to be at least an
    order of magnitude beyond the most negative real data point.
    """
    token = str(missing_value)
    return ", ".join([token] * int(header['NumOfVar']))


def get_special_comments(datagroup, header):
    """Populate the (currently always empty) special-comments header fields."""
    special_comments = []
    # No special comments are generated yet; the hooks remain for the future.
    header['NumberOfSpecialComments'] = str(len(special_comments))
    header['Special_comments'] = "\n".join(special_comments)


def get_mission_name(datagroup):
    """Returns the IOP long name associated with the data file, as defined in the SOLR API that ARM's data discovery
    tool uses.  If a mission name cannot be found for the instrument, returns instead the unicode string 'N/A' """
    # This function tries to use the data's datastream name and base_time to determine the name of the IOP the
    # data was gathered in.  Accesses the SOLR API at www.archive.arm.gov .
    data_date = datetime.utcfromtimestamp(datagroup.variables["base_time"][0])
    search_date = data_date.strftime("%Y-%m-%dT%H:%M:%SZ")

    datastream_name = datagroup.datastream
    # First query is to get the instrument class for the datastream for the date of the data.
    # Fixed: the HTTP responses are now closed via context managers instead of being leaked.
    with urllib.request.urlopen("http://ui1b.ornl.gov:8983/solr/browser_0/query?q=datastream:{}%20AND%20start_date:[*%20TO%20{}]%20AND%20end_date:[{}%20TO%20*]&wt=json".format(datastream_name, search_date, search_date)) as connection:
        result = json.loads(connection.read().decode("utf-8"))
    records = result["response"]["docs"]
    if len(records) < 1:
        # No instrument class specified for data, so no chance of discovering mission name
        print("No instrument class found for datastream, cannot determine mission name. Setting to N/A")
        return "N/A"
    ic = records[0]["instrument_class"][0]

    # Then we check which IOPs use the instrument class on the data's date
    with urllib.request.urlopen(
            "http://ui1b.ornl.gov:8983/solr/fc/query?q=instrument_class:{}%20AND%20start_date:[*%20TO%20{}]%20AND%20end_date:[{}%20TO%20*]&wt=json&rows=100".format(
                ic, search_date, search_date)) as connection:
        iops = json.loads(connection.read().decode("utf-8"))["response"]["docs"]

    if len(iops) > 0:
        # If not all the returned records have the same mission name, then it may not be possible to
        # determine the mission name with the current information. Error out.
        for record in iops:
            if record["iopLongName"] != iops[0]["iopLongName"]:
                print(("Mismatch: '{}' and '{}'".format(record["iopLongName"], iops[0]["iopLongName"])))
                print("Not all mission names found for datastream match.")
                print("Cannot determine correct mission name. Setting to N/A")
                return "N/A"
        return iops[0]["iopLongName"]
    else:
        # If a mission name could not be found, dataset may not be a part of an IOP
        return "N/A"


def get_data_source(datagroup):
    """Return the data-source description for the ICARTT header.

    Prefers the 'input_datastreams' global attribute, with its newlines
    replaced by ', ' so the rigid ICARTT header line count is preserved.
    When that attribute is absent (a1-level streams may lack it), falls back
    to '<datastream> : <process_version>'.
    """
    if "input_datastreams" not in datagroup.ncattrs():
        return datagroup.datastream + " : " + datagroup.process_version
    return datagroup.input_datastreams.replace("\n", ", ")


def set_variables(datagroup, header, vars):
    """Build the per-variable header lines and return the cleaned column list.

    Populates header['Variables'] (one '<name>, <unit>' line per output
    column) and header['NumOfVar'].  Two-dimensional binned variables are
    unrolled into one column per bin, named '<var>_<lower>-<upper>'; string
    second dimensions are skipped since ICARTT cannot carry strings in the
    time-series data.  Returns the selected variable names (time fields
    removed) in the order the data columns will be written.

    Fixed: removed interleaved line-number junk that had been pasted into the
    function body and broke the module's syntax; logic is unchanged.
    """
    variables = get_variables(datagroup)

    results = OrderedDict()
    interval, interval_unit, uniform = get_data_interval(datagroup)
    # A data interval larger than 1 (or non-uniform) requires a stop_time
    # column; this mirrors the stop_time handling in trans_data.
    if interval > 1 or not uniform:
        results["stop_time"] = "stop_time" + ', ' + interval_unit

    # We manually handle time, so we don't want time fields in our var_columns
    var_columns = [x for x in list(variables.keys()) if variables[x].size > 1 and x in vars]
    clean_var_columns = [column for column in var_columns if column not in ["time", "time_offset", "time_bounds"]]

    independent_var_dimension = variables[var_columns[0]].dimensions[0]
    for var_name in clean_var_columns:
        # We are only interested in variables dimensioned by the independent variable.
        if independent_var_dimension not in variables[var_name].dimensions:
            continue

        # It seems some data files have variables that do not have the 'units' attribute.
        if "units" not in variables[var_name].ncattrs() or variables[var_name].units == 'unitless':
            var_unit = 'N/A'
        else:
            var_unit = variables[var_name].units.split()[0]

        if len(variables[var_name].dimensions) >= 2:
            # This algorithm should always complement the corresponding unbinning in the data:
            # unbin and split into one column per bin.
            if "bin_number" in variables[var_name].dimensions:
                for lower, upper in zip(variables["lower_size_limit"], variables["upper_size_limit"]):
                    new_var_name = var_name + "_" + str(lower) + "-" + str(upper)
                    results[new_var_name] = new_var_name + ", " + var_unit
            elif "string" in variables[var_name].dimensions[1]:
                # If the second dimension indicates the variable is a string, we will ignore it.
                # ICARTT seems to have trouble with having strings in the time series data.
                continue
            else:
                # The second dimension names a coordinate variable whose 'bounds'
                # attribute points at the per-bin limit variable.
                second_dimension = variables[variables[var_name].dimensions[1]]
                bound_variable = variables[second_dimension.bounds]
                for bin_index in range(len(second_dimension)):
                    new_var_name = (var_name + "_" + str(bound_variable[bin_index, 0]) + "-"
                                    + str(bound_variable[bin_index, 1]))
                    results[new_var_name] = new_var_name + ", " + var_unit
        else:
            results[var_name] = var_name + ", " + var_unit

    header['Variables'] = get_lines(results)
    header['NumOfVar'] = str(len(results))

    return clean_var_columns


def set_head_line_num(header):
    """Set header['Numberlines'] to '<total>, 1001' once all counts are known.

    The fixed 14 covers the non-comment, non-variable header lines of the
    ICARTT 1001 format; comment and variable counts are added on top.
    """
    fixed_line_count = 14
    total = (int(header['NumberOfNormalComments'])
             + int(header['NumberOfSpecialComments'])
             + int(header['NumOfVar'])
             + fixed_line_count)
    header['Numberlines'] = "%d, 1001" % total


def trans_header(datagroup, header, normal_comments, revision_info, vars):
    """Fill in every section of the ICARTT header from the netCDF datagroup.

    Mutates *header*, *normal_comments*, and *revision_info* in place and
    returns the cleaned variable column order from set_variables so the data
    writer can emit columns consistent with the header.

    Fixed: removed interleaved line-number junk that had been pasted into the
    function body and broke the module's syntax; logic is unchanged.
    """
    # set the name
    header["Name"] = "ARM Aerial Facility Team"

    # set the Organization
    header["Organization"] = "ARM PNNL"

    # set Data_source_description
    header["Data_source_description"] = get_data_source(datagroup)

    # set Mission_name
    header["Mission_name"] = get_mission_name(datagroup)

    # set file num and num of file
    header["Filenum_Numfile"] = "1,1"

    # set begin and end date
    basetime, header["Databegin_rev"] = get_start_end_date(datagroup)

    # set Data_interval
    header["Data_interval"] = set_data_interval(datagroup)

    # set Independent variable
    header["Independent_Variable"] = get_independent_var(datagroup)

    # set variables
    var_columns = set_variables(datagroup, header, vars)

    # set Missing data and scale, which depend on knowing the line number.  set_variables must be called first
    header["Missing_data"] = get_missing_data(header)
    header["Scale"] = get_scale(header)

    # set special Comments
    get_special_comments(datagroup, header)

    # set normal comments
    get_normal_comments(datagroup, header, normal_comments, revision_info)

    # finally we know the number of lines
    set_head_line_num(header)

    # if the order of the input variables and the header don't match the icartt file is wrong.
    # return the cleaned variable order so we can pass it along and maintain the consistency
    return var_columns

def get_launch_information(filename, in_dir):
    """Determine the launch number for *filename* among its sibling files.

    For AAF datastreams, all data files collected on the same day represent
    different 'launches' of the same datastream; we work under the assumption
    that each data file represents one launch.  If this stops being true, we
    may need logic to determine the number of file volumes per launch.  All
    files in *in_dir* that match *filename* on every dot-separated field
    except the date/time fields are treated as launches of the same set; the
    1-based position of *filename* in the time-sorted list is its launch
    number.

    Parameters
    ----------
    filename: str
        Base filename to determine launch information for, expected in ADI's
        netCDF naming format, e.g. sgpaafccn200F1.a1.20160411.20582.nc
    in_dir: str
        Directory whose contents are scanned for sibling launch files.

    Returns
    -------
    Launch number (1-based int) for the filename.

    Raises
    ------
    FileNotFoundError
        If *in_dir* does not exist.  (Fixed: the error was previously printed
        and swallowed, leaving ``file_list`` undefined and triggering a
        NameError a few lines later.)
    """
    try:
        file_list = os.listdir(in_dir)
    except FileNotFoundError:
        print("file not found when trying to list contents of in_dir: {}".format(in_dir))
        raise

    # This rework of the filename strips out the date/time fields (3rd and
    # 4th), matching all files for the same instrument/site/facility/datalevel
    # regardless of when they were collected.
    matching_files = []
    filename_without_time = ".".join(filename.split(".")[:2] + filename.split(".")[4:])

    for target_file in file_list:
        # A target file needs at least 5 fields when split, otherwise it has no chance of matching
        if len(target_file.split(".")) >= 5:
            target_filename_without_time = ".".join(target_file.split(".")[:2] + target_file.split(".")[4:])
            if target_filename_without_time == filename_without_time:
                matching_files.append(target_file)

    # All matching names differ only in the time field, so a lexical sort is
    # also a chronological sort.
    matching_files.sort()
    try:
        # List index is 0 indexed, but our launch counts are 1 indexed, hence the "+ 1"
        return matching_files.index(filename) + 1
    except ValueError:
        print("Should not be reached.  Issue with launch determination, given filename not in directory.")
        sys.exit(1)


def trans_data(var_columns, datalength, datagrp):
    """Assemble the ICARTT time-series table as a pandas DataFrame.

    Builds a 'start_time' (and, for coarse/non-uniform intervals, a
    'stop_time') column from the netCDF time variables, appends every
    requested variable dimensioned by the independent variable (unrolling
    binned 2-D variables into per-bin columns), and adjusts the missing-data
    sentinel if real data approaches -9999 in magnitude.

    Parameters
    ----------
    var_columns: list of str
        Variable names, in header order, to pull from the netCDF file.
    datalength: int
        Length of the first dimension (currently unused in this function).
    datagrp: netCDF4.Dataset
        Source netCDF data.

    Returns
    -------
    (DataFrame, missing_value): the assembled table and the missing-data
    sentinel actually in effect (default -9999; more negative when needed).
    """
    variables = get_variables(datagrp)

    # We manually handle time, so we don't want time fields in our var_columns
    clean_var_columns = [column for column in var_columns if column not in ["time", "time_offset", "time_bounds"]]
    new_var_columns = []
    independent_var = None
    if "time_bounds" in list(variables.keys()):
        # If there are time bounds specified, start_time and stop_time are pulled from the bounds
        print("Time bounds found")
        independent_var = "time_bounds"
        data = variables["time_bounds"][:, 0] #TODO is this supposed to be step size? I think this will fail
        new_var_columns.append("start_time")
        # If the interval is greater than 1 second or is not uniform, we need a stop_time
        interval, interval_unit, uniform = get_data_interval(datagrp)
        if interval > 1 or not uniform:
            data = np.column_stack([data, variables["time_bounds"][:, 1]])
            new_var_columns.append("stop_time")
    else:
        # If there are no time bounds specified, start_time and stop_time are assumed to be the same, pulled from "time"
        independent_var = "time"
        data = variables["time"][:]
        new_var_columns.append("start_time")
        # If the interval is greater than 1 second or is not uniform, we need a stop_time
        interval, interval_unit, uniform = get_data_interval(datagrp)
        if interval > 1 or not uniform:
            # TODO Probably rip out the temp fix.  Seems converter doesn't like start_time == stop_time
            # "time" is the time offset from midnight, which is what ICARTT uses for 'start_time'
            stop_time_padding = interval / 1000.0
            data = np.column_stack([data, variables["time"][:] + stop_time_padding])
            new_var_columns.append("stop_time")

    # Every variable in the output file should have the same dimension as the independent variable
    independent_var_dimension = variables[independent_var].dimensions[0]
    for var in clean_var_columns:
        if var == 'stop_time':
            # If there is a stop_time variable, it should have already been handled.
            continue

        # We are only interested in variables that match the independent variable
        if independent_var_dimension in variables[var].dimensions:
            if len(variables[var].dimensions) >= 2:
                # This algorithm should always complement the corresponding unbinning in the header
                # Unbin and split
                if "bin_number" in variables[var].dimensions:
                    for x, y in zip(variables["lower_size_limit"], variables["upper_size_limit"]):
                        new_var_columns.append(var + "_" + str(x) + "-" + str(y))
                    data = np.column_stack([data, variables[var][:]])
                elif "string" in variables[var].dimensions[1]:
                    # If the second dimension indicates the variable is a string, we will ignore it.
                    # ICARTT seems to have trouble with having strings in the time series data.
                    print(("Skipping string field '{}'".format(var)))
                    continue
                else:
                    # Generally, two dimensional variables should have bins.  We get the name of the second dimension,
                    # whose name should match a coordinate variable that will define the bounds for us.
                    second_dimension = variables[variables[var].dimensions[1]]
                    bound_variable = variables[second_dimension.bounds]
                    for bin_index in range(len(second_dimension)):
                        new_var_columns.append(var + "_" + str(bound_variable[bin_index, 0]) + "-" +
                                               str(bound_variable[bin_index, 1]))
                        data = np.column_stack([data, variables[var][:, bin_index]])
                    continue
            else:
                try:
                    data = np.column_stack([data, variables[var][:]])
                except:
                    # NOTE(review): bare except also catches KeyboardInterrupt /
                    # SystemExit; consider narrowing to Exception.
                    print("Failed to properly insert data into the output array. Aborting.")
                    exit()
                new_var_columns.append(var)
        else:
            print(("Not dimensioned by independent variable.  Dropping var '{}'".format(var)))

    # Now we need to modify the independent variable, if time_offset, to be start_time
    # NOTE(review): new_var_columns[0] is always set to "start_time" above, so
    # this branch looks unreachable -- confirm before removing.
    if new_var_columns[0] == "time_offset":
        new_var_columns[0] = "start_time"
    # We need to handle cases where the data becomes too negative to distinguish well from the missing values.
    # For example, usually we will have a missing value of -9999.  If a real data point approaches this, we
    # have a harder time telling what is missing data and what is just very negative data.  ICARTT requires that
    # The missing value field be an order of magnitude greater than the most negative data point, so a data point
    # of -12345 requires a missing value of -999999 or less (We will stick with 9's to stay close to ARM conventions.
    # First we get the minimum of the entire dataset, but we exclude -9999, as those are the current missing values.
    absolute_min = np.min(data[data != -9999])
    new_missing_value = -9999
    if absolute_min < -999:
        # If the minimum is lower than -999, it has reached the same magnitude as the current missing value.
        # Calculate the missing value that is one order of magnitude greater than the minimum
        # https://stackoverflow.com/questions/16839190/python-categorising-a-list-by-orders-of-magnitude
        magnitude = int(math.floor(math.log10(abs(absolute_min))))
        # This allows -99999, -999999, -9999999, etc.  The +1 takes it down from a power of 10, which means it
        # loses an order of magnitude.  So we have to increase the magnitude by 2 to make up for this.
        new_missing_value = - pow(10, magnitude + 2) + 1
        # Now we replace all current missing values with the new value
        np.place(data, data == -9999, [new_missing_value])
        print(("Minimum data point {} too low for default missing value. Updated to: {}".format(absolute_min,
                                                                                               new_missing_value)))

    result = pd.DataFrame(data, columns=new_var_columns)

    return result, new_missing_value


def main(ncfile, vars, in_dir, out_dir):
    """
    Convert a single netCDF file into an ICARTT (.ict) formatted file.

    Parameters
    ----------
    ncfile: string
        The name of the file to convert from netCDF format to ICARTT format.
    vars: list of string
        Variable names to extract; a single entry of 'all' selects every
        variable in the dataset.
    in_dir: string
        Directory containing the input netCDF files (used to look up launch
        information from sibling files).
    out_dir: string
        Base output directory; the file is written into an 'ascii-icartt'
        subdirectory created beneath it.
    """
    # These elements compose the key header information for the ICARTT file format.
    # Deep copies so per-file mutation does not leak into the module defaults.
    header = copy.deepcopy(DEFAULT_HEADER)
    normal_comments = copy.deepcopy(DEFAULT_NORMAL_COMMENTS)
    revision_info = copy.deepcopy(DEFAULT_REVISION_INFO)

    # Get the filename to parse certain information out (instrument name, site, etc.)
    filename = os.path.basename(ncfile)

    launch_number = get_launch_information(filename, in_dir)

    # Read in the netCDF file that is to be converted.  Wrap the conversion in
    # try/finally so the dataset handle is released even if a step raises.
    datagrp = Dataset(ncfile, "r")
    try:
        if 'all' in vars:
            vars = datagrp.variables.keys()

        dimensions, datalength = get_dimensions(datagrp)

        var_columns = trans_header(datagrp, header, normal_comments, revision_info, vars)

        # ICARTT format specifies that if the data interval is greater than one
        # second, it will be set to the value '0'.  This means that in addition to
        # a 'start_time' column, we must add a matching 'stop_time' column.
        if header['Data_interval'] == 0:
            var_columns.insert(1, 'stop_time')

        # Translate the data to build a full pandas dataframe. Also gets the missing
        # value from the dataset.  Missing value will be set depending on the data,
        # due to how ICARTT requires that the missing value be an order of magnitude
        # lower than the lowest data point.  Defaults to -9999.
        df, missing_value = trans_data(var_columns, datalength, datagrp)

        header["Missing_data"] = get_missing_data(header, missing_value=missing_value)

        # Date string (format "YYYYMMDD") from the filename
        date_string = filename.split('.')[2]

        # Build the ICARTT compliant file name from the available information.
        # Launch number of 1 is implied if omitted, so by the standards, only add
        # the launch number to the filename after the first launch.
        name_fields = [filename[3:9], filename[0:3], date_string,
                       normal_comments['REVISION'][-2:]]
        if launch_number > 1:
            # By the conventions, the launch number is of the form LX, where X is
            # the launch number, and is the last field before the '.ict' extension.
            # If we had multiple data volumes per launch, we would have a volume
            # number after the launch number.  Current assumption is that there is
            # one data file per AAF launch.
            name_fields.append("L" + str(launch_number))
        output_path = "_".join(x.upper() for x in name_fields) + '.ict'

        out_dir = validate_output(out_dir)
        output_path = os.path.join(out_dir, output_path)
        # Write the header first, then the data frame in CSV form
        header_lines = get_lines(header)
        with open(output_path, "w") as ict_file:
            for line in header_lines:
                ict_file.write(line)
            df.to_csv(ict_file, index=False)
    finally:
        datagrp.close()


def validate_output(out_dir):
    """
    Ensure the output directory and its 'ascii-icartt' subdirectory exist.

    Parameters
    ----------
    out_dir: string
        Base output directory requested by the caller.

    Returns
    -------
    string
        Path to the 'ascii-icartt' subdirectory inside ``out_dir``; it is
        guaranteed to exist on return.
    """
    # ICARTT output files are grouped under an 'ascii-icartt' subdirectory.
    out_dir = os.path.join(out_dir, 'ascii-icartt')
    # makedirs creates all missing intermediate directories in one call;
    # exist_ok=True avoids the race between an isdir() check and creation.
    os.makedirs(out_dir, exist_ok=True)
    return out_dir

# User-facing help text for the command-line interface (note: "aerial" data,
# i.e. aircraft measurements, not the "arial" typo of the original).
help_description = """
This program converts netcdf files for aerial data into csv files in icartt format.
"""

example = """
EXAMPLE: python3 nc2icartt.py -i input/directory -o output/directory -v /path/to/var_list.txt -f /path/to/file_list.txt
"""

def parse_arguments():
    """
    Parse command-line options for the converter.

    Prints the full help text when the program is invoked with no
    arguments at all.

    Returns
    -------
    tuple
        (namespace of recognized arguments, list of unrecognized arguments)
    """
    parser = argparse.ArgumentParser(description=help_description, epilog=example,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    required = parser.add_argument_group("required arguments")
    required.add_argument("-i", "--indir", type=str, dest="in_dir",
                          help="The directory where input files are.")
    required.add_argument("-o", "--outdir", type=str, dest="out_dir",
                          help="The directory to put output file in.")
    required.add_argument("-v", "--varlist", type=str, dest="var_list", default=False,
                          help="The name of a file that contains the names of variables to extract.")
    required.add_argument("-f", "--filelist", type=str, dest="file_list",
                          help="The name of a file that contains the names of files to convert.")

    known, unknown = parser.parse_known_args()
    # Bare invocation (no flags): show usage so the user sees what is needed.
    if len(sys.argv) <= 1:
        parser.print_help()
    return known, unknown

def nc2icartt(main_args):
    """
    Convert every netCDF file named in ``main_args.file_list`` to ICARTT.

    Parameters
    ----------
    main_args: argparse.Namespace
        Must provide ``file_list`` (path to a text file listing input
        filenames, one per line), ``var_list`` (path to a text file of
        variable names, or a falsy value to convert all variables),
        ``in_dir`` and ``out_dir``.

    Returns
    -------
    int
        0 when the file list was processed, 1 on a file error or when the
        list is empty.
    """
    print('opening {}'.format(main_args.file_list))
    try:
        with open(main_args.file_list) as open_file:
            print('reading lines')
            files = open_file.readlines()
    # BUGFIX: exceptions must be caught as a tuple, not a list — a list here
    # raises TypeError at catch time in Python 3.  Also report file_list in
    # the message (the original printed var_list by mistake).
    except (TypeError, FileNotFoundError):
        print("nc2icartt: File error - file_list = {}".format(main_args.file_list))
        return 1

    print('striping files')
    files = list(map(str.strip, files))
    print('creating full paths to files in {}'.format(main_args.in_dir))
    files = [os.path.join(main_args.in_dir, x) for x in files]

    print('opening {}'.format(main_args.var_list))
    if main_args.var_list:
        with open(main_args.var_list) as open_file:
            print('reading variables')
            variables = open_file.readlines()
    else:
        # No variable list supplied: fall back to converting every variable.
        variables = ['all']

    print('striping variables')
    variables = list(map(str.strip, variables))

    if len(files) > 0:
        print('processing {} files'.format(len(files)))
        for ncfile in files:
            main(ncfile, variables, main_args.in_dir, main_args.out_dir)
        print('finished processing {} files'.format(len(files)))
        return 0
    else:
        print("no files in file list.")
        return 1

if __name__ == '__main__':
    # Command-line entry point: parse the options (input/output directories,
    # variable-list file, file-list file) and run the batch conversion over
    # every file named in the file list.
    main_args, unknownArgs = parse_arguments()

    nc2icartt(main_args)