Last updated

Metadata and Filtering: Organizing Your AI Data 🔍

Master the art of organizing and finding your data using Dataloop's powerful metadata and filtering capabilities.

Working with Metadata 📝

1. Adding Metadata

import dtlpy as dl

# Fetch an existing item by its platform ID, then attach custom metadata.
# Throughout this guide, user-defined metadata lives under the 'user' key.
item = dataset.items.get(item_id='your-item-id')
item.metadata['user'] = {
    'photographer': 'John Doe',
    'location': 'New York',
    'camera': {
        'model': 'Canon EOS R5',
        'settings': {
            'iso': 100,
            'aperture': 'f/2.8',
            'shutter_speed': '1/1000'
        }
    },
    'tags': ['outdoor', 'daylight']
}
# Persist the change; update() returns the refreshed item entity
item = item.update()

# Metadata can also be attached at upload time, avoiding a second request
item = dataset.items.upload(
    local_path='/path/to/image.jpg',
    metadata={
        'user': {
            'project_id': 'PRJ-123',
            'batch': 'B-001'
        }
    }
)

2. Updating Metadata

# Update specific fields on a single item, then persist with update()
item.metadata['user']['status'] = 'reviewed'
item.metadata['user']['last_modified'] = '2024-03-20'
item = item.update()

# Batch update metadata on every item matching the filter.
# NOTE(review): the dotted keys ('user.status') appear to address nested
# metadata fields — confirm against the dtlpy Items.update() reference.
filters = dl.Filters(field='dir', values='/batch1')
dataset.items.update(
    filters=filters,
    update_values={
        'user.status': 'processed',
        'user.batch': 'B-001'
    }
)

3. Metadata Schema

# Define a JSON-Schema-style schema constraining the 'user' metadata namespace
schema = {
    "type": "object",
    "properties": {
        "user": {
            "type": "object",
            "properties": {
                "status": {
                    # Only these three workflow states are valid
                    "type": "string",
                    "enum": ["new", "in-progress", "reviewed"]
                },
                "quality": {
                    # Integer rating restricted to the range 1..5
                    "type": "integer",
                    "minimum": 1,
                    "maximum": 5
                }
            }
        }
    }
}

# Apply the schema to the dataset and persist.
# NOTE(review): 'metadata_schema' as a writable dataset attribute — confirm
# against the current dtlpy Dataset entity reference.
dataset.metadata_schema = schema
dataset.update()

Advanced Filtering 🎯

1. Basic Filters

# Create an empty filter object (defaults to the item resource)
filters = dl.Filters()

# Filter by filename; '*' wildcards are supported
filters.add(field='filename', values='*.jpg')

# Filter by directory
filters.add(field='dir', values='/raw-data')

# Filter by creation date (wildcard matches any day in March 2024)
filters.add(field='createdAt', values="2024-03-*")

# Execute the query; items.list() returns paged results, not a plain list
pages = dataset.items.list(filters=filters)

2. Metadata Filters

# Filter by metadata fields
filters = dl.Filters()

# Exact match on a nested metadata field
filters.add(field='metadata.user.status', values='reviewed')

# Multiple values: matches when the field equals ANY of the listed values
filters.add(field='metadata.user.tags', values=['outdoor', 'daylight'], operator=dl.FiltersOperations.IN)

# Numeric comparison: ISO strictly greater than 100
filters.add(field='metadata.user.camera.settings.iso', 
           values=100,
           operator=dl.FiltersOperations.GREATER_THAN)

# Nested field exists: per the dtlpy filters reference, the EXISTS
# operator is used with values=True (previously the values argument
# was omitted, which is not the documented form)
filters.add(field='metadata.user.camera', 
           values=True,
           operator=dl.FiltersOperations.EXISTS)

3. Complex Queries

# Combining multiple filters on the item resource
filters = dl.Filters(resource=dl.FiltersResource.ITEM)

# AND operation (default): successive add() calls must all match
filters.add(field='metadata.user.status', values='reviewed')
filters.add(field='metadata.user.quality', values=5)

# OR operation
# NOTE(review): dtlpy usually expresses OR between clauses via the
# 'method' argument (method=dl.FiltersMethod.OR); confirm that passing
# FiltersOperations.OR as 'operator' behaves as intended here.
filters.add(field='metadata.user.tags', values=['important', 'urgent'],
           operator=dl.FiltersOperations.OR)

# NOT operation
# NOTE(review): add_join appears to filter on the joined annotation
# resource (excluding items with a 'rejected' label) — verify against
# the dtlpy Filters.add_join reference.
filters.add_join(field='label', values='rejected',
                operator=dl.FiltersOperations.NOT)

4. Pagination and Sorting

# Configure the filter and sorting BEFORE fetching: sort_by() shapes the
# query, so calling it after items.list() (as the original snippet did)
# has no effect on the pages already retrieved.
filters = dl.Filters()

# Sort results by quality, ascending
filters.sort_by(field='metadata.user.quality', value=dl.FiltersOrderByDirection.ASCENDING)

# Get items with pagination: page_offset selects the starting page,
# page_size caps the number of items returned per page
pages = dataset.items.list(
    filters=filters,
    page_offset=0,
    page_size=50
)

Practical Examples 💡

1. Quality Control Pipeline

def quality_control_pipeline(dataset):
    """Find reviewed, high-quality items and process each one.

    Selects items whose user metadata has status 'reviewed' and a
    quality score of 4 or 5, then passes every matching item to
    process_high_quality_item().

    Args:
        dataset: a dtlpy Dataset entity to query.
    """
    filters = dl.Filters()
    # Pass the IN operator explicitly for the list of accepted scores,
    # consistent with the other multi-value filters in this guide
    filters.add(field='metadata.user.quality', values=[4, 5],
                operator=dl.FiltersOperations.IN)
    filters.add(field='metadata.user.status', values='reviewed')

    pages = dataset.items.list(filters=filters)

    # items.list() returns paged results: iterate the pages, then the
    # items inside each page (iterating the result directly yields
    # pages, not items)
    for page in pages:
        for item in page:
            process_high_quality_item(item)

2. Data Organization

def organize_by_metadata(dataset):
    """Move every item into a folder named after its 'user.category' metadata.

    Items with no user metadata, or no 'category' key, are filed under
    '/uncategorized'.

    Args:
        dataset: a dtlpy Dataset entity whose items will be moved.
    """
    filters = dl.Filters()
    pages = dataset.items.list(filters=filters)

    # Snapshot the items before moving any of them: moving items while
    # iterating the live paged result could skip or repeat entries.
    # (Paged results yield pages, so flatten page-by-page.)
    items = [item for page in pages for item in page]

    for item in items:
        # .get(...) twice so items without a 'user' namespace don't raise
        category = item.metadata.get('user', {}).get('category', 'uncategorized')

        # Destination path: one folder per category
        new_path = f'/{category}/{item.name}'

        dataset.items.move(item=item, new_path=new_path)

3. Batch Processing

def process_unreviewed_items(dataset):
    """Find and process unreviewed items"""
    # Select only items still marked as new
    unreviewed = dl.Filters()
    unreviewed.add(field='metadata.user.status', values='new')

    # Fetch matching items in batches of 100
    pages = dataset.items.list(
        filters=unreviewed,
        page_size=100
    )

    # Walk every batch, handling one item at a time
    for batch in pages:
        for entry in batch:
            process_item(entry)

            # Flag the item so it is skipped on the next run
            entry.metadata['user']['status'] = 'processed'
            entry.update()

Best Practices 👑

1. Metadata Structure

  • Use consistent naming conventions
  • Keep metadata hierarchical
  • Document metadata schema
  • Validate metadata values

2. Query Optimization

# Prefer precise fields over broad pattern matching when possible
filters.add(field='metadata.user.status', values='reviewed')  # ✅

# Combine filters efficiently: narrow by directory first, then by metadata
filters = dl.Filters(resource=dl.FiltersResource.ITEM)
filters.add(field='dir', values='/dataset1')
filters.add(field='metadata.user.status', values='reviewed')

3. Error Handling

def safe_metadata_update(item, updates):
    """Safely merge key/value pairs into an item's 'user' metadata.

    Args:
        item: entity exposing a dict-like ``.metadata`` attribute and an
            ``.update()`` method (e.g. a dtlpy Item).
        updates (dict): user-metadata keys and values to set.

    Returns:
        The updated item on success, or None if the update failed.
    """
    try:
        # setdefault creates the 'user' namespace when the item has no
        # user metadata yet — the original indexed item.metadata['user']
        # directly and raised KeyError in that case
        user_metadata = item.metadata.setdefault('user', {})
        user_metadata.update(updates)
        return item.update()
    except Exception as e:
        # Broad catch is deliberate: this helper is best-effort and must
        # not propagate SDK errors to the caller
        print(f"Error updating metadata: {str(e)}")
        return None

Ready to explore task management? Let's move on to the next chapter! 🚀