Source code for cloudreg.scripts.run_colm_pipeline_ec2

from .util import start_ec2_instance, run_command_on_server
import argparse
import boto3


def run_colm_pipeline(
    ssh_key_path,
    instance_id,
    input_s3_path,
    output_s3_path,
    num_channels,
    autofluorescence_channel,
    log_s3_path=None,
    instance_type="r5d.24xlarge",
):
    """Run COLM pipeline on EC2 instance

    Starts the instance, refreshes the CloudReg checkout and Docker image on
    it, runs the SSD mount script, runs the pipeline through
    ``docker-compose`` and finally stops the instance. Once the instance has
    been started it is stopped again even if one of the remote commands
    raises, so a failed run does not leave it running.

    Whatever ``run_command_on_server`` returns for each of the three remote
    commands is printed (presumably the remote error output -- confirm
    against ``util.run_command_on_server``).

    Args:
        ssh_key_path (str): Local path to ssh key needed for this server
        instance_id (str): ID of the EC2 instance to run pipeline on
        input_s3_path (str): S3 Path to raw data
        output_s3_path (str): S3 path to store precomputed volume. Volume is stored at output_s3_path/channel for each channel.
        num_channels (int): Number of channels in this volume
        autofluorescence_channel (int): Autofluorescence channel number
        log_s3_path (str, optional): S3 path to store intermediates including vignetting correction and Terastitcher files. When None or empty, the ``--log_s3_path`` flag is not passed to the pipeline. Defaults to None.
        instance_type (str, optional): AWS EC2 instance type. Defaults to "r5d.24xlarge".
    """
    # Local import keeps this function self-contained with respect to the
    # module's import block.
    import shlex

    # get ec2 resource; its low-level client is used to stop the instance
    ec2 = boto3.resource("ec2")

    public_ip_address = start_ec2_instance(instance_id, instance_type)

    try:
        # update the code on the instance
        update_command = "mkdir -p ~/ssd1 ~/ssd2; git clone https://github.com/neurodata/CloudReg.git; cd CloudReg; git pull; docker pull neurodata/cloudreg;"
        print("updating CloudReg code on EC2 instance...")
        errors_update = run_command_on_server(
            update_command, ssh_key_path, public_ip_address
        )
        print(errors_update)

        # mount ssds command
        mount_command = "sudo bash CloudReg/cloudreg/scripts/mount_combined_ssds.sh"

        # colm pipeline command; every caller-supplied value is shell-quoted
        # so that spaces or shell metacharacters cannot change the command.
        pipeline_args = [
            input_s3_path,
            output_s3_path,
            num_channels,
            autofluorescence_channel,
        ]
        if log_s3_path:
            # Only forward the flag when a path was actually given; otherwise
            # the literal text "None" (or a dangling flag) would be sent.
            pipeline_args += ["--log_s3_path", log_s3_path]
        quoted_args = " ".join(shlex.quote(str(arg)) for arg in pipeline_args)
        pipeline_command = (
            "cd CloudReg/; time docker-compose run "
            "-v ~/ssd1:/root/ssd1 -v ~/ssd2:/root/ssd2 cloudreg " + quoted_args
        )
        print(pipeline_command)

        errors_mount = run_command_on_server(
            mount_command, ssh_key_path, public_ip_address
        )
        print(errors_mount)
        errors_pipeline = run_command_on_server(
            pipeline_command, ssh_key_path, public_ip_address
        )
        print(errors_pipeline)
    finally:
        # shut down instance, even when a remote command failed
        ec2.meta.client.stop_instances(InstanceIds=[instance_id])


if __name__ == "__main__":
    # Command-line entry point: parse the arguments and forward them to
    # run_colm_pipeline.
    # The first positional parameter of ArgumentParser is ``prog``, so the
    # summary text has to be passed explicitly as ``description``.
    parser = argparse.ArgumentParser(
        description="Run COLM pipeline on remote EC2 instance with given input parameters"
    )
    # The pipeline cannot run without the following six values, so they are
    # marked required instead of silently defaulting to None.
    parser.add_argument(
        "-ssh_key_path",
        help="path to identity file used to ssh into given instance",
        required=True,
    )
    parser.add_argument(
        "-instance_id",
        help="EC2 Instance ID of instance to run COLM pipeline on.",
        required=True,
    )
    parser.add_argument(
        "-input_s3_path",
        help="S3 path to input colm data. Should be of the form s3://<bucket>/<experiment>",
        type=str,
        required=True,
    )
    parser.add_argument(
        "-output_s3_path",
        help="S3 path to store precomputed volume. Precomputed volumes for each channel will be stored under this path. Should be of the form s3://<bucket>/<path_to_precomputed>. The data will be saved at s3://<bucket>/<path_to_precomputed>/CHN0<channel>",
        type=str,
        required=True,
    )
    parser.add_argument(
        "-num_channels",
        help="Number of channels in experiment",
        type=int,
        required=True,
    )
    parser.add_argument(
        "-autofluorescence_channel",
        help="Autofluorescence channel number.",
        type=int,
        required=True,
    )
    parser.add_argument(
        "-log_s3_path",
        help="S3 path at which pipeline intermediates can be stored including bias correction tile.",
        type=str,
        default="",
    )
    parser.add_argument(
        "--instance_type",
        help="EC2 instance type to run pipeline on. minimum r5d.16xlarge",
        type=str,
        default="r5d.16xlarge",
    )

    args = parser.parse_args()

    run_colm_pipeline(
        args.ssh_key_path,
        args.instance_id,
        args.input_s3_path,
        args.output_s3_path,
        args.num_channels,
        args.autofluorescence_channel,
        args.log_s3_path,
        instance_type=args.instance_type,
    )