Skip to content

Instance Types

Datablast provides different instance types for Python tasks, allowing you to choose the appropriate compute resources based on your workload requirements and performance needs.

Instance types determine the compute resources available to your Python tasks:

  • CPU: Processing power for computation-intensive tasks
  • Memory: RAM for data processing and model training
  • Storage: Temporary storage for intermediate data
  • Network: Bandwidth for data transfer and API calls
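| Instance Type | CPU Limit | Memory Limit | CPU Request | Memory Request | Use Case |
|---------------|-----------|--------------|-------------|----------------|----------|
| d1.nano       | 250m      | 512Mi        | 250m        | 256Mi          | Lightweight tasks, testing (Default) |
| d1.small      | 500m      | 1200Mi       | 500m        | 1Gi            | Small data processing |
| d1.medium     | 750m      | 2400Mi       | 750m        | 2Gi            | Medium workloads |
| d1.large      | 1         | 4400Mi       | 1           | 4Gi            | Large data processing |
| d1.xlarge     | 2         | 6600Mi       | 2           | 6Gi            | Heavy workloads, ML training |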

d1.nano is the default instance type - no need to specify unless you need more resources.

⚠️ Important: Using instance types other than d1.nano may incur additional charges. Please consult with your Datablast representative for pricing details before upgrading instance types.

name: "ml_models.churn_prediction"
type: "python"
run: "churn_prediction.py"
instance: "d1.large"
ml_models.churn_prediction
# @blast.type: python
# @blast.instance: d1.large
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
# Your ML model training code here

d1.nano — Best for: Lightweight tasks, testing, simple data processing

data_processing.simple_transform
# @blast.type: python
# @blast.instance: d1.nano # Default, can be omitted
import pandas as pd
# Read the input rows, add a `processed` column holding twice `value`,
# and write the result back out without the index column.
data = pd.read_csv('input.csv')
data = data.assign(processed=data['value'] * 2)
data.to_csv('output.csv', index=False)

Characteristics:

  • Minimal resource usage
  • Fast startup time
  • Cost-effective for simple tasks
  • Suitable for testing and development

d1.small — Best for: Small data processing, basic analytics

analytics.basic_metrics
# @blast.type: python
# @blast.instance: d1.small
import pandas as pd
import numpy as np
# Summarise the event log into a few headline metrics.
data = pd.read_csv('events.csv')
event_count = len(data)
distinct_users = data['user_id'].nunique()
mean_duration = data['duration'].mean()
metrics = {
    'total_events': event_count,
    'unique_users': distinct_users,
    'avg_duration': mean_duration,
}

Characteristics:

  • Moderate resource usage
  • Good for small to medium datasets
  • Balanced performance and cost
  • Suitable for basic analytics

d1.medium — Best for: Medium data processing, feature engineering

ml_models.feature_engineering
# @blast.type: python
# @blast.instance: d1.medium
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
# Standardise the numeric feature columns in place before modelling.
data = pd.read_csv('user_data.csv')
features = ['age', 'income', 'activity_score']
raw_features = data[features]
# Fit the scaler on the selected columns, then apply it to the same columns.
scaler = StandardScaler().fit(raw_features)
data[features] = scaler.transform(raw_features)

Characteristics:

  • Good for medium datasets
  • Suitable for feature engineering
  • Balanced CPU and memory
  • Cost-effective for most workloads

d1.large — Best for: Large data processing, ML model training

ml_models.model_training
# @blast.type: python
# @blast.instance: d1.large
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib
# ML model training
data = pd.read_csv('training_data.csv')
X = data.drop('target', axis=1)
y = data['target']
# Seed the split as well as the model so repeated runs train and evaluate on
# the same rows; otherwise results differ from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Report accuracy on the held-out 20% so the test split is actually used.
print(f"Test accuracy: {model.score(X_test, y_test):.3f}")
# Save the model
joblib.dump(model, 'model.pkl')

Characteristics:

  • High performance for large datasets
  • Suitable for ML model training
  • Good for complex data processing
  • Higher cost but better performance

d1.xlarge — Best for: Heavy workloads, large-scale ML training

ml_models.deep_learning
# @blast.type: python
# @blast.instance: d1.xlarge
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
# Deep learning model training
data = pd.read_csv('large_dataset.csv')
X = data.drop('target', axis=1).values
y = data['target'].values
# Build and train deep learning model.
# The input width is declared with an explicit Input layer rather than an
# `input_shape=` argument on the first Dense layer, which current Keras
# versions warn about for Sequential models.
model = keras.Sequential([
    keras.Input(shape=(X.shape[1],)),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.3),
    # Single sigmoid unit: paired with binary cross-entropy below.
    keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# NOTE(review): validation_split presumably holds out the trailing 20% of
# rows; confirm the CSV is not ordered by target before relying on it.
model.fit(X, y, epochs=100, batch_size=32, validation_split=0.2)

Characteristics:

  • Maximum performance
  • Suitable for deep learning
  • Best for large-scale processing
  • Highest cost but maximum performance
import psutil
import os
def check_memory_usage(warn_threshold_mb=400):
    """Print this process's resident memory and warn when it is high.

    Args:
        warn_threshold_mb: Resident memory, in MB, above which a warning is
            printed. The default of 400 suits the default d1.nano instance
            (512Mi memory limit); pass a higher value when running on a
            larger instance type.
    """
    process = psutil.Process(os.getpid())
    # The resident set size is divided down twice by 1024 to report it in MB.
    memory_mb = process.memory_info().rss / 1024 / 1024
    print(f"Memory usage: {memory_mb:.2f} MB")
    # Warn before the instance's memory limit is reached.
    if memory_mb > warn_threshold_mb:
        print("Warning: High memory usage detected")
# Use in your task
check_memory_usage()
import psutil
import os
def check_cpu_usage():
    """Print CPU utilisation sampled over one second; warn above 80%."""
    # NOTE(review): psutil.cpu_percent presumably reports system-wide usage;
    # confirm it reflects this instance's CPU limit.
    usage = psutil.cpu_percent(interval=1)
    print(f"CPU usage: {usage}%")
    is_high = usage > 80
    if is_high:
        print("Warning: High CPU usage detected")
# Use in your task
check_cpu_usage()
  • Start with d1.nano and scale up as needed
  • Monitor resource usage and adjust accordingly
  • Use larger instances only when necessary
  • Consider cost vs performance trade-offs
import psutil
import time
import logging
logger = logging.getLogger(__name__)
def monitor_resources(duration=300, interval=30):
    """Log memory and CPU usage periodically for a fixed length of time.

    The call blocks until ``duration`` seconds have passed; it does not
    detect when other work finishes. Run it in a background thread if it
    should sample while the task's main work executes.

    Args:
        duration: Total time to keep sampling, in seconds (default 300,
            i.e. 5 minutes).
        interval: Pause between samples, in seconds (default 30). Each
            sample additionally spends one second measuring CPU usage.
    """
    # A monotonic clock keeps the deadline correct even if the wall clock
    # is adjusted while monitoring is running.
    deadline = time.monotonic() + duration
    while True:
        memory_percent = psutil.virtual_memory().percent
        cpu_percent = psutil.cpu_percent(interval=1)
        logger.info("Memory: %s%%, CPU: %s%%", memory_percent, cpu_percent)
        remaining = deadline - time.monotonic()
        # Stop once the monitoring window has elapsed.
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        time.sleep(min(interval, remaining))
# Use in your task
monitor_resources()
  • Start with the smallest instance that meets your needs
  • Monitor resource usage and scale up if necessary
  • Consider the cost implications of larger instances
  • Test performance with different instance types
  • Monitor memory and CPU usage
  • Optimize code to use resources efficiently
  • Use appropriate data structures and algorithms
  • Implement proper error handling
  • Use larger instances only when necessary
  • Monitor costs and optimize accordingly
  • Consider alternative approaches for cost-sensitive tasks
  • Document instance requirements and justifications
  • Profile your code to identify bottlenecks
  • Use appropriate libraries and frameworks
  • Optimize data processing pipelines
  • Implement caching where appropriate
  • Issue: Task fails with memory errors
  • Solution: Upgrade to a larger instance type
  • Debug: Monitor memory usage and optimize code
  • Issue: Task runs slowly
  • Solution: Upgrade to a larger instance type or optimize code
  • Debug: Profile code and monitor resource usage
  • Issue: Unexpected high costs
  • Solution: Optimize code and use appropriate instance types
  • Debug: Monitor resource usage and cost patterns
import psutil
import os
import logging
logger = logging.getLogger(__name__)
def debug_resources():
    """Log system, process and environment details for troubleshooting.

    Logs the CPU count and total memory reported by psutil, the current
    process's resident memory and CPU usage, and the value of the
    ``BLAST_INSTANCE_TYPE`` environment variable ('unknown' when unset).
    """
    # System information
    logger.info("CPU count: %s", psutil.cpu_count())
    total_gb = psutil.virtual_memory().total / 1024 / 1024 / 1024
    logger.info("Memory total: %.2f GB", total_gb)
    # Process information
    process = psutil.Process(os.getpid())
    rss_mb = process.memory_info().rss / 1024 / 1024
    logger.info("Process memory: %.2f MB", rss_mb)
    # A first cpu_percent() call without an interval has no earlier sample to
    # compare against and returns 0.0, so measure over a short blocking
    # interval to log a real value.
    logger.info("Process CPU: %s%%", process.cpu_percent(interval=0.1))
    # Environment information
    logger.info("Instance type: %s", os.getenv('BLAST_INSTANCE_TYPE', 'unknown'))
# Call debug function
debug_resources()