Ensemble Learning with Sagemaker and Step-Functions
- Dr. Benjamin Weigel | 09.09.2019
Hamburg, Germany
Ensemble Learning with Sagemaker and Step-Functions Dr. Benjamin - - PowerPoint PPT Presentation
Ensemble Learning with Sagemaker and Step-Functions Dr. Benjamin Weigel | 09.09.2019 Hamburg, Germany Benjamin Weigel Data Engineer & Cloud Coordinator Europace AG https://www.europace.de/ There is manual effort in obtaining a mortgage
Hamburg, Germany
Data Engineer & Cloud Coordinator Europace AG
https://www.europace.de/
Text Model Image Model Sequence Model trained on OCR-extracted text trained on page-bitmap trained on sequence information (i.e. “Page 1-4 is a contract”) use output as input
series of steps
○ a state and the transition to the next ○ error-conditions etc.
{ "Comment" : "An example of the Amazon States Language." , "StartAt" : "FirstState" , "States" : { "FirstState" : { "Type": "Task", "Resource" : "arn:aws:lambda:us-east-1:123456789012:function:..." , "Next": "ChoiceState" }, ... } }
"ChoiceState": { "Type": "Choice", "Choices": [ { "Variable": "$.foo", "NumericEquals": 1, "Next": "FirstMatchState" }, { "Variable": "$.foo", "NumericEquals": 2, "Next": "SecondMatchState" } ], "Default": "DefaultState" }
Transform and Training Jobs directly via these Resources:
"arn:aws:states:::sagemaker:createTransformJob.sync" "arn:aws:states:::sagemaker:createTrainingJob.sync"
https://docs.aws.amazon.com/step-functions/latest/dg/connect-sagemaker.html
https://docs.aws.amazon.com/step-functions/latest/dg/connect-sagemaker.html "Image Model Training": { "Type": "Task", "Resource": "arn:aws:states:::sagemaker:createTrainingJob.sync", "Parameters": { "TrainingJobName": "ImageModel", "AlgorithmSpecification": { "TrainingImage": "520713654638.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-mxnet:1.3-gpu-py3", "TrainingInputMode": "File" }, "HyperParameters": { "epochs": "80", "batch_size": "10", "conv_block_length": "2", "cycle_length": "10", "depth": "5", "dropout": "0.5", "max_lr": "0.1", "min_lr": "0.0001", ... "start_filter": "4", "worker": "4" }, "InputDataConfig.$": "$.generated.image_model.InputDataConfig", "OutputDataConfig": { "S3OutputPath.$": "$.generated.output_artifact_paths.image_model_prefix" }, "ResourceConfig": { "InstanceCount": 4, "InstanceType": "ml.p2.xlarge", "VolumeSizeInGB": 10 }, "RoleArn": "arn:aws:iam::123456789012:role/sm-stepfunction-iam-role", "StoppingCondition": { "MaxRuntimeInSeconds": 172800 } } } https://docs.aws.amazon.com/sagemaker/latest/dg/API_CreateTrainingJob.html#API_CreateTrainingJob_RequestSyntax
Photo by Joshua Ness on Unsplash
{ "StartAt": "Train Text Model", "States": { "Train Text Model": { "Type": "Task", "Resource": "arn:aws:states:::sagemaker:createTrainingJob.sync", "Parameters": { ... }, "End": true } } }
{ "StartAt": "Fetch Preprocessed Data" , "States": { "Fetch Preprocessed Data": { "Type": "Task", "Resource": "arn:aws:states:::batch:submitJob.sync" , "Next": "Train Text Model" , "Parameters": { "JobName": "FetchPreparedData" , "JobDefinition": "arn:aws:batch:us-east-1:1234567890:job-definition/job:2", "JobQueue": "arn:aws:batch:us-east-1:1234567890:job-queue/queue" , "Parameters": { "DATA_INPUT_PATH.$": "$.input_data" , "OUTPUT_PATH.$": "$.ready_to_use_artifacts" } } }, "Train Text Model": { ... } } }
"Train Text Model": { "Type": "Task", "Resource": "arn:aws:states:::sagemaker:createTrainingJob.sync", ... "Retry": [ { "ErrorEquals": [ "SageMaker.AmazonSageMakerException" ], "IntervalSeconds": 1, "MaxAttempts": 100, "BackoffRate": 1.1 }, ... ] }
"Train Text Model": { "Type": "Task", ... "Catch": [{ "ErrorEquals": ["States.ALL" ], "Next": "Notify Failure" }] }, "Notify Failure": { "Type": "Task", "Resource": "arn:aws:states:::sns:publish" , "End": true, "Parameters": { "Subject": "[ERROR] Model Training failed!" , "Message": "Error during model training!" , "TopicArn": "arn:aws:sns:*:123456789012:alerting_topic" , "MessageAttributes": { ... } } }
...it’s a valid state after all
"Notify Failure": { "Type": "Task", "Resource": "arn:aws:states:::sns:publish" , "Next": "Fail", ... }, "Fail": { "Type": "Fail" }
Text Model Image Model Sequence Model trained on OCR-extracted text trained on page-bitmap trained on sequence information (i.e. “Page 1-4 is a contract”) use output as input
"Fetch Preprocessed Data": { ... "Next": "Base Model Training" }, "Base Model Training": { "Type": "Parallel" , "Next": "Train Sequence Model" , "Branches": [ { "StartAt": "Train Image Model" , "States": { "Train Image Model": { ... "End": true } } },{ "StartAt": "Train Text Model" , "States": { "Train Text Model": { ... "End": true }}}]},
notification trigger won’t fire because there is no state defined for this scenario
"Base Model Training": { "Type": "Parallel", "Next": "Train Sequence Model" , "Branches": [...], "Catch": [ { "ErrorEquals": [ "States.ALL" ], "Next": "Notify Failure" } ] }
then something fails and you have to debug (rerun) …
skip some steps...
"States": { "Skip Image Model Training?": { "Type": "Choice" , "Choices": [ { "Variable": "$.train_image_model" , "BooleanEquals": false, "Next": "Skip Fetch Preprocessing Artifacts" } ], "Default": "Train Image Model" }, "Skip Fetch Preprocessing Artifacts": { "Type": "Pass", "End": true }, "Train Image Model": { ... "End": true } }
...and add a little sprinkle on top
machine execution
S3 (each execution gets its
Data (S3) Models & Data (S3) “Setup” Input
Photo by Markus Spiske on Unsplash
{ "initialization": { "fetch_data": { "image_model_artifact_path": "", "preprocessed_data_path": "s3://data/2019-08-31T21:53:12+0200" , "text_model_artifact_path": "" }, "image_model": { "batch_size": "128", "instance_type": "ml.p2.xlarge" }, "output_artifact_target_base_path": "s3://data/model_training_data" , "run_training_steps": { "image_model": true, "text_model": true, "fetch_preprocessing_artifacts": true, "generate_image_model_split": true }, "sequence_model": { ... }, "text_model": { ... } } }
initial input generated parametrization of state machine via “setup” function max 32768 characters for input/result !
"Image Model Training" : { "Type": "Task", "Resource" : "arn:aws:states:::sagemaker:createTrainingJob.sync" , "Parameters" : { "TrainingJobName.$" : "$.generated.image_model.TrainingJobName" , "HyperParameters" : { "epochs" : "80", "batch_size.$" : "$.initialization.image_model.batch_size" , "bucket.$" : "$.initialization.image_model.log_bucket" , "conv_block_length" : "2", "cycle_length" : "10", "depth": "5", "dropout" : "0.5", "job_name.$" : "$.generated.image_model.TrainingJobName" , "max_lr" : "0.1", "min_lr" : "0.0001" , "worker" : "4" }, "InputDataConfig.$" : "$.generated.image_model.InputDataConfig" , "ResourceConfig" : { "InstanceCount" : 4, "InstanceType.$" : "$.initialization.image_model.instance_type" ... { "initialization": { "fetch_data": { "image_model_artifact_path": "", "preprocessed_data_path": "s3://data/2019-08-31T21:53:12+0200" , "text_model_artifact_path": "" }, "image_model": { "batch_size": "128", "instance_type": "ml.p2.xlarge" }, "output_artifact_target_base_path": "s3://data/model_training_data" , "run_training_steps": { "image_model": true, "text_model": true, "fetch_preprocessing_artifacts": true, "generate_image_model_split": true }, "sequence_model": { ... }, "text_model": { ... } } }
"Is Capacity Error?" : { "Type": "Choice", "Comment": "Retry if capacity error." , "Choices": [ { "Variable": "$.error-info.Cause.FailureReason" , "StringEquals" : "CapacityError: Unable to provision requested ML compute
capacity. Please retry using a different ML instance type." , "Next": "Wait 10 Minutes" } ], "Default": "FailImageModel" },
No notification about failure!
No notification about failure!
Photo by Zoltan Tasi on Unsplash
"HyperParameters": { "s3_log_folder": "\"logs/sagemaker\"", "job_name.$": "$.generated.text_model.TrainingJobName", "sagemaker_container_log_level": "20", "sagemaker_enable_cloudwatch_metrics": "false", "sagemaker_program": "\"sagemaker_entry_point.py\"", "sagemaker_region": "\"${AWS::Region}\"", "sagemaker_submit_directory": "\"${textModelArtifactPath}\"" },
"HyperParameters": { "s3_log_folder": "\"logs/sagemaker\"", "job_name.$": "$.generated.text_model.TrainingJobName", "sagemaker_container_log_level": "20", "sagemaker_enable_cloudwatch_metrics": "false", "sagemaker_program": "\"sagemaker_entry_point.py\"", "sagemaker_region": "\"${AWS::Region}\"", "sagemaker_submit_directory": "\"${textModelArtifactPath}\"" },
"HyperParameters": { "s3_log_folder": "\"logs/sagemaker\"", "job_name.$": "$.generated.text_model.TrainingJobName", "sagemaker_container_log_level": "20", "sagemaker_enable_cloudwatch_metrics": "false", "sagemaker_program": "\"sagemaker_entry_point.py\"", "sagemaker_region": "\"${AWS::Region}\"", "sagemaker_submit_directory": "\"${textModelArtifactPath}\"" },
"HyperParameters": { "s3_log_folder": "\"logs/sagemaker\"", "job_name.$": "$.generated.text_model.TrainingJobName", "sagemaker_container_log_level": "20", "sagemaker_enable_cloudwatch_metrics": "false", "sagemaker_program": "\"sagemaker_entry_point.py\"", "sagemaker_region": "\"${AWS::Region}\"", "sagemaker_submit_directory": "\"${textModelArtifactPath}\"" },
class TrainingEnvironment(ContainerEnvironment): # TODO expecting serialized hyperparams might break containers that aren't launched by python sdk @staticmethod def _deserialize_hyperparameters(hp): ... for (k, v) in hp.items(): ... hyperparameter_dict[k] = json.loads(v)
Cloudformation
wrong?
SendSnsStateMachine : Type: 'AWS::StepFunctions::StateMachine' Properties : StateMachineName : 'send-hello-world-sns' RoleArn: !GetAtt Role.Arn DefinitionString : |- { "StartAt" : "HelloWorld" , "States" : { "HelloWorld" : { "Type": "Task", "Resource" : "arn:aws:states:::sns:publish" , "Parameters" : { "TopicArn" : "arn:aws:sns:eu-central-1:0123456789:hello-world" , "Message" : { "Input": "Hello from Step Functions!" , } }, "End": true } } }
import cdk = require('@aws-cdk/core' ); import stepfunction = require('@aws-cdk/aws-stepfunctions' ); import stepfunctionTasks = require('@aws-cdk/aws-stepfunctions-tasks' ); ... new stepfunction .CfnStateMachine (this, "state-machine" , { definitionString: fs. readFileSync ("./lib/statemachine.json" ).toString (), roleArn: "..." }); const dataBucket = new S3.Bucket(this, "data-bucket" ) const startState = new stepfunction .Pass(this, 'StartState' ); const trainText = new stepfunction .Task(this, "SageMaker" , { task: new stepfunctionTasks .SagemakerTrainTask ({ trainingJobName: "TextModelTraining" , inputDataConfig: [ { channelName: "channel_1" , dataSource: { s3DataSource: { s3Location: stepfunctionTasks.S3Location. fromBucket (dataBucket, "input") } } } ], ... }) }); const definition = startState. next(trainText) new stepfunction .StateMachine (this, 'StateMachine' , { definition: definition }); { "StartAt": "Train Text Model" , "States": { "Train Text Model": { "Type": "Task", "Resource": "arn:aws:states:::sagemaker:createTrainingJob.sync" , "Parameters": { ... }, "End": true } } }
workflow
"PrepareXY": { "Type": "Pass", "Result": { "x": 0.381018, "y": 622.2269926397355 }, "ResultPath": "$.coords", "Next": "FindArealPhoto" }, "FindArealPhoto": { "Type": "Task", ... "End": true }
Parameters
@dreigelb