Commit 177e1013 authored by Carina Lansing
Browse files

Fixed the container so it would run on shifter where no write access is...

Fixed the container so it would run on Shifter, where no write access is allowed inside the container. Now the datastream directory points to the outputs folder, which is mounted from the host machine.
parent 640fb3a6
......@@ -36,8 +36,7 @@ COPY scripts /bin/
################################################################
# Set up LASSO data folders
################################################################
RUN mkdir -p /data/lasso/collection/sgp && \
mkdir /data/lasso/inputs && \
RUN mkdir -p /data/lasso/inputs && \
mkdir /data/lasso/outputs
COPY data /data
......
......@@ -17,3 +17,6 @@ docker tag lasso-o_shcu registry.gitlab.com/gov-doe-arm/docker/lasso-o_shcu
# To deploy to registry, we do this:
docker push registry.gitlab.com/gov-doe-arm/docker/lasso-o_shcu
# TODO: we may switch the image to DockerHub at clansing/lasso-o_shcu:1.0
......@@ -11,10 +11,8 @@ import shutil
# Global parameters
data_home = '/data/lasso'
input_dir = f'{data_home}/inputs' # should mount to collections/sgp folder on host
output_dir = f'{data_home}/outputs' # should mount to any folder on host
collection_dir = f'{data_home}/collection/sgp'
datastream_dir = f'{data_home}/datastream/sgp'
input_dir = f'{data_home}/inputs' # should mount to data/inputs folder on host
output_dir = f'{data_home}/outputs' # should mount to data/outputs folder on host
# Passed via container environment params - e.g., (20180710.115900)
begin_datetime = os.environ['BEGIN_DATETIME']
......@@ -32,9 +30,10 @@ def get_wrfstat_filename(sim_num):
def get_sim_numbers():
sim_numbers = []
wrfout_pattern = re.compile('sgpwrfout(.)C1\\.00')
sgp_dir = os.path.join(input_dir, 'sgp')
for filename in os.listdir(input_dir):
if os.path.isdir(os.path.join(input_dir, filename)):
for filename in os.listdir(sgp_dir):
if os.path.isdir(os.path.join(sgp_dir, filename)):
match = wrfout_pattern.match(filename)
if match:
sim_num = match.group(1)
......@@ -42,74 +41,20 @@ def get_sim_numbers():
# Make sure there is also a corresponding wrfstat folder
wrfstat_filename = get_wrfstat_filename(sim_num)
if not os.path.isdir(os.path.join(input_dir, wrfstat_filename)):
if not os.path.isdir(os.path.join(sgp_dir, wrfstat_filename)):
raise Exception(f'Could not find matching wrfstat datastream for {filename}')
return sim_numbers
def copy_builtin_data(datastream_name):
source = os.path.join(datastream_dir, datastream_name)
dest = os.path.join(output_dir, datastream_name)
builtin_datastream_dir = f'{data_home}/datastream/sgp'
source = os.path.join(builtin_datastream_dir, datastream_name)
dest = os.path.join(output_dir, 'sgp', datastream_name)
# Copy the content of source to destination
destination = shutil.copytree(source, dest)
def create_sym_link(source_file, linked_file):
    """Create a symbolic link at ``linked_file`` pointing to ``source_file``.

    The call is a no-op when something already exists at ``linked_file``,
    whether that is a regular file, a directory, a valid symlink or a
    dangling symlink.

    Args:
        source_file: Path the new link should point to.
        linked_file: Path at which the link is created.
    """
    try:
        # Attempt the link directly instead of testing os.path.exists() first:
        # exists() follows symlinks, so a dangling link would pass the check
        # and os.symlink() would then fail with FileExistsError.
        os.symlink(source_file, linked_file)
    except FileExistsError:
        # Something is already in place at linked_file; leave it untouched.
        pass
def link_input_data(sim_num):
    """Symlink one simulation's input files into the collection tree.

    For the wrfstat folder and then the wrfout folder of ``sim_num``, each
    regular file found in that folder under ``input_dir`` gets a symbolic
    link of the same name in the matching folder under ``collection_dir``.
    The destination folder is created first if it is missing.

    Args:
        sim_num: Simulation identifier passed to the filename helpers.
    """
    # Process wrfstat first, then wrfout; each name is built just before use.
    for build_name in (get_wrfstat_filename, get_wrfout_filename):
        folder = build_name(sim_num)
        src_root = os.path.join(input_dir, folder)
        dst_root = os.path.join(collection_dir, folder)
        pathlib.Path(dst_root).mkdir(parents=True, exist_ok=True)

        for entry in os.listdir(src_root):
            src_path = os.path.join(src_root, entry)
            # Only plain files are linked; sub-directories are skipped.
            if not os.path.isfile(src_path):
                continue
            create_sym_link(src_path, os.path.join(dst_root, entry))
def link_output_datastream_dir(filename):
    """Expose an output-folder datastream inside the datastream directory.

    Ensures ``<output_dir>/<filename>`` exists, then symlinks it to
    ``<datastream_dir>/<filename>``.

    Args:
        filename: Name of the datastream folder to create and link.
    """
    target = os.path.join(output_dir, filename)
    link_path = os.path.join(datastream_dir, filename)

    # The link target must be present before the link is created.
    pathlib.Path(target).mkdir(exist_ok=True, parents=True)
    create_sym_link(target, link_path)
def link_output_simulation_data(sim_num):
    """Link every output datastream of one simulation to the output folder.

    Args:
        sim_num: Simulation identifier embedded in each datastream name.
    """
    datastreams = (
        # Sym link the output datastreams that we care about to the output
        # folder. All the other datastreams will be gone when the container
        # finishes.
        f'sgplassodiagmod{sim_num}C1.m1',
        f'sgplassodiagobsmod{sim_num}C1.m1',
        f'sgplassodiagobsmodz{sim_num}C1.m1',
        f'sgplassomod{sim_num}C1.m1',
        # Turns out we have to sym link all the outputs because otherwise the
        # container gets too big, and it will fail.
        f'sgpwrfout{sim_num}C1.00',
        f'sgpwrfout{sim_num}C1.m0',
        f'sgpwrfstat{sim_num}C1.00',
        f'sgpwrfstat{sim_num}C1.m0',
    )
    for datastream in datastreams:
        link_output_datastream_dir(datastream)
def link_log_folder():
    """Make ``/data/lasso/logs`` a symlink to the ``logs`` folder in ``output_dir``.

    The ``logs`` folder under ``output_dir`` is created first if it is missing.
    """
    host_logs = os.path.join(output_dir, 'logs')
    pathlib.Path(host_logs).mkdir(exist_ok=True, parents=True)
    create_sym_link(host_logs, '/data/lasso/logs')
# Copy the content of source to destination if it does not already exist
if not os.path.isdir(dest):
destination = shutil.copytree(source, dest)
def get_env():
......@@ -117,10 +62,10 @@ def get_env():
# Set data environment variables
env['DATA_HOME'] = data_home
env['COLLECTION_DATA'] = f'{data_home}/collection'
env['COLLECTION_DATA'] = f'{data_home}/inputs'
env['CONF_DATA'] = f'{data_home}/conf'
env['DATASTREAM_DATA'] = f'{data_home}/datastream'
env['LOGS_DATA'] = f'{data_home}/logs'
env['DATASTREAM_DATA'] = f'{data_home}/outputs'
env['LOGS_DATA'] = f'{data_home}/outputs/logs'
env['DB_CONNECT_PATH'] = '/data/db'
env['PATH'] ='/apps/ds/bin:/apps/process/bin:/apps/tool/bin:/apps/transfer/bin:/apps/base/bin:/usr/bin:/bin:/usr/sbin:/sbin:.'
......@@ -185,14 +130,16 @@ def main():
# First find the simulation numbers in the user input
sim_numbers = get_sim_numbers()
# Link the log folder so user can check log files from output dir
link_log_folder()
# We have to copy the built-in datasets to the outputs folder so that the
# outputs folder can be used as DATASTREAM_DATA folder since only
# mounted folders are writable when running via shifter
copy_builtin_data('sgpcldfracset01mC1.c1')
copy_builtin_data('sgpcldfracset15mC1.c1')
copy_builtin_data('sgplassodiagobsC1.c1')
# Now iterate over the sim numbers, running the LASSO processes in sequence
# TODO: later we can run these in parallel with dask
for sim_number in sim_numbers:
link_input_data(sim_number)
link_output_simulation_data(sim_number)
run_wrfstat(sim_number) # wrfstat ingest
run_wrfout(sim_number) # wrfout ingest
run_lassomod(sim_number) # lassomod vap
......@@ -200,20 +147,10 @@ def main():
run_lassodiagobsmodz(sim_number) # lassodiagobsmodz vap
run_lassodiagobsmod(sim_number) # lassodiagobsmod vap
# Link the lassocore outputs
link_output_datastream_dir('sgplassoscoreC1.m1')
link_output_datastream_dir('sgplassoscorezC1.m1')
# lassoscore vaps
run_lassoscorez()
run_lassoscore()
# Finally, we have to copy the built-in datasets to the outputs folder so that the user can use them
# in the jupyter notebooks after the container has finished
copy_builtin_data('sgpcldfracset01mC1.c1')
copy_builtin_data('sgpcldfracset15mC1.c1')
copy_builtin_data('sgplassodiagobsC1.c1')
if __name__ == "__main__":
main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment