Advanced Item and Annotation Examples 🎓

Welcome to our advanced guide for working with items and annotations! Here you'll find detailed examples and advanced techniques for managing your data in Dataloop.

Advanced Item Examples and Operations 🎯

Advanced Upload Scenarios 🚀

Batch Upload with Metadata and Annotations

Want to upload multiple items with their metadata and annotations? Here's how to use a Pandas DataFrame:

import pandas as pd
import dtlpy as dl

# Prepare your data
upload_data = [
    {
        'local_path': r"E:\TypesExamples\000000000064.jpg",
        'local_annotations_path': r"E:\TypesExamples\000000000064.json",
        'remote_path': '/first',
        'remote_name': 'f.jpg',
        'item_metadata': {'user': {'category': 'first'}}
    },
    {
        'local_path': r"E:\TypesExamples\000000000776.jpg",
        'local_annotations_path': r"E:\TypesExamples\000000000776.json",
        'remote_path': "/second",
        'remote_name': 's.jpg',
        'item_metadata': {'user': {'category': 'second'}}
    }
]

# Create DataFrame
df = pd.DataFrame(upload_data)

# Upload with DataFrame
dataset = dl.datasets.get(dataset_id='your-dataset-id')
items = dataset.items.upload(
    local_path=df,
    overwrite=True
)

Working with Different File Types 📁

Image Arrays with OpenCV

import cv2
import numpy as np

# Create or load your array
img_array = cv2.imread('path/to/image.jpg')
# Or create a random array
random_array = np.random.rand(100, 100, 3) * 255
random_array = random_array.astype(np.uint8)

# Upload array (remember to specify remote_name!)
item = dataset.items.upload(
    local_path=random_array,
    remote_name='generated_image.jpg'  # Only .jpg or .png formats are supported!
)

# Download as array
buffer = item.download(
    save_locally=False,  # Returns a buffer instead of saving to disk
    to_array=True       # Converts the buffer directly to a numpy array
)

Important Notes:

  • Use save_locally=False to get a buffer instead of saving to disk
  • Use to_array=True to get the buffer as a numpy array (both modes are sketched just below)
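
Here's a minimal sketch of both download modes side by side, assuming item is the image item uploaded above; decoding the raw buffer with PIL is just one way to consume it:

from PIL import Image

# Mode 1: get a raw buffer (nothing written to disk) and decode it yourself
buffer = item.download(save_locally=False)
image = Image.open(buffer)
print(image.size)

# Mode 2: let the SDK convert the buffer straight into a numpy array
array = item.download(save_locally=False, to_array=True)
print(array.shape)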

Image Arrays with PIL

from PIL import Image
import numpy as np

# Load with PIL
pil_image = Image.open('path/to/image.jpg')
np_array = np.asarray(pil_image)

# Upload the array
item = dataset.items.upload(
    local_path=np_array,
    remote_name='pil_image.jpg'
)

Advanced Metadata Operations 📊

Complex Metadata Structure

# Prepare complex metadata
metadata = {
    'user': {
        'categories': ['dog', 'cat'],
        'attributes': {
            'size': 'large',
            'colors': ['brown', 'white'],
            'age': 3
        },
        'validation': {
            'verified': True,
            'verified_by': 'john.doe@example.com',
            'verified_date': '2024-01-01'
        }
    }
}

# Upload with complex metadata
item = dataset.items.upload(
    local_path='path/to/image.jpg',
    remote_name='pet.jpg',
    item_metadata=metadata
)
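
Once uploaded, the metadata lives under item.metadata, so you can read it back or extend it later. A short sketch that assumes the item uploaded above (the extra 'reviewed' field is purely illustrative):

# Read the nested metadata back from the platform
item = dataset.items.get(item_id=item.id)
print(item.metadata['user']['attributes']['size'])  # 'large'

# Add another user-level field and persist the change
item.metadata['user']['reviewed'] = True
item = item.update()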

Batch Metadata Update

# Update metadata for multiple items
filters = dl.Filters()
filters.add(field='dir', values='/pets')

update_values = {
    'user': {
        'batch_processed': True,
        'process_date': '2024-01-01'
    }
}

dataset.items.update(
    filters=filters,
    update_values=update_values
)
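
To confirm the batch update went through, you can query on the new metadata field. A quick sketch, assuming the update above has completed (user metadata is filtered under the 'metadata.user' prefix):

# Count how many items now carry the new flag
verify_filters = dl.Filters()
verify_filters.add(field='metadata.user.batch_processed', values=True)
pages = dataset.items.list(filters=verify_filters)
print(f'{pages.items_count} items were updated')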

Working with Large Datasets 🗄️

Progress Tracking

import tqdm

# Get all items
filters = dl.Filters()
pages = dataset.items.list(filters=filters)

# Create progress bar
pbar = tqdm.tqdm(total=pages.items_count)

# Process items with progress
for item in pages.all():
    # Your processing logic here
    process_item(item)
    pbar.update()

Parallel Processing

from concurrent.futures import ThreadPoolExecutor
import tqdm

def process_item(item):
    # Your processing logic here
    return True

# Create progress bar
pbar = tqdm.tqdm(total=pages.items_count)

# Process in parallel
with ThreadPoolExecutor(max_workers=32) as executor:
    futures = []
    for item in pages.all():
        future = executor.submit(process_item, item)
        futures.append(future)
        
    # Track progress
    for future in futures:
        future.result()
        pbar.update()

Advanced Annotation Examples and Operations 🎯

Welcome to our advanced guide for working with annotations! Here you'll find detailed examples and techniques for managing complex annotation scenarios in Dataloop.

Working with Different Formats 📦

Converting from COCO Format

converter = dl.Converter()
converter.upload_local_dataset(
    from_format=dl.AnnotationFormat.COCO,
    dataset=dataset,
    local_items_path=r"C:/path/to/items",
    # Make sure item names match the COCO JSON file
    local_annotations_path=r"C:/path/to/annotations/file/coco.json"
)

Working with VTT Format

Perfect for video transcription annotations:

# Local paths
local_item_path = r"/Users/local/path/to/item.mp4"
local_vtt_path = r"/Users/local/path/to/subtitles.vtt"

# Upload item
item = dataset.items.upload(local_path=local_item_path)

# Upload VTT file - wait for item upload to complete
builder = item.annotations.builder()
builder.from_vtt_file(filepath=local_vtt_path)
item.annotations.upload(builder)

Video Annotations 🎥

Adding Time-Based Annotations

Here's how to handle annotations that span multiple frames with visibility changes:

import pandas as pd

# Read annotation data from CSV
df = pd.read_csv(r"C:/file.csv")

# Get video item
item = dataset.items.get(item_id="video-item-id")
builder = item.annotations.builder()

# Add annotations frame by frame
for i_row, row in df.iterrows():
    builder.add(
        annotation_definition=dl.Box(
            top=row["top"],
            left=row["left"],
            bottom=row["bottom"],
            right=row["right"],
            label=row["label"]
        ),
        object_visible=row["visible"],  # Handle visibility
        object_id=row["annotation id"],  # Track same object across frames
        frame_num=row["frame"]
    )

# Upload all annotations
item.annotations.upload(annotations=builder)

Audio Annotations 🎵

# Get your audio file
item = dataset.items.get(filepath="/my_audio.mp4")

# Create annotations using builder
builder = item.annotations.builder()
builder.add(
    annotation_definition=dl.Subtitle(label="speech", text="Hello world"),
    start_time="00:00:01",
    end_time="00:00:05"
)

# Add multiple segments
builder.add(
    annotation_definition=dl.Subtitle(label="music", text="Background music"),
    start_time="00:00:06",
    end_time="00:00:10"
)

# Upload annotations
item.annotations.upload(builder)

Batch Operations 📊

Copy Annotations Between Items

# Get source and target items
source_item = dataset.items.get(item_id="source-id")
target_item = dataset.items.get(item_id="target-id")

# Copy all annotations
target_item.annotations.upload(source_item.annotations.list())
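
If you only need some of the annotations, you can filter the collection in Python before uploading. A small sketch, assuming the source item has annotations labeled 'dog' (the label is illustrative):

# Copy only annotations with a specific label
source_annotations = source_item.annotations.list()
dog_annotations = [ann for ann in source_annotations.annotations if ann.label == 'dog']
target_item.annotations.upload(annotations=dog_annotations)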

Upload from Local JSON

# Load annotations from JSON file
annotations = dl.AnnotationCollection.from_json_file(
    filepath=r"/home/project/annotations.json"
)

# Upload to item
item = dataset.items.get(item_id="target-item-id")
item.annotations.upload(annotations=annotations)

Downloading Annotations 📥

Multiple Format Downloads

You can download annotations in various formats:

# Download in multiple formats
dataset.download(
    local_path=r"C:/downloads",
    annotation_options=[
        dl.ViewAnnotationOptions.MASK,
        dl.ViewAnnotationOptions.JSON,
        dl.ViewAnnotationOptions.INSTANCE
    ]
)

Filtered Downloads

Download specific annotations based on filters:

# Filter for specific items
item_filters = dl.Filters(resource=dl.FiltersResource.ITEM, field="dir", values="/specific_folder")

# Filter for specific annotations
annotation_filters = dl.Filters(
    resource=dl.FiltersResource.ANNOTATION,
    field="label",
    values="desired_label"
)

# Download with filters
dataset.download(
    local_path=r"C:/filtered_downloads",
    filters=item_filters,
    annotation_filters=annotation_filters,
    annotation_options=dl.ViewAnnotationOptions.JSON
)

Format Conversion 🔄

Want to convert annotations between different formats? We've got you covered! First, grab our handy converter toolkit:

  1. Install our dtlpy-converters package 🛠️

pip install git+https://github.com/dataloop-ai-apps/dtlpy-converters

  2. Let's start converting! 🚀

Converting TO Dataloop Format ⬇️

Here's how to bring your COCO/YOLO/VOC annotations into Dataloop:

import asyncio
import dtlpy as dl
from dtlpyconverters.uploaders import ConvertersUploader

# Initialize our magical converter
converter = ConvertersUploader()

# 🎯 COCO to Dataloop
coco_dataset = dl.datasets.get(dataset_id="dataset_id")
asyncio.run(converter.coco_to_dataloop(
    dataset=coco_dataset,
    input_items_path=r"C:/path/to/coco/items",
    input_annotations_path=r"C:/path/to/coco/items/annotations",
    # Make sure item filenames match the COCO json! 🎯
    coco_json_filename="annotations.json",
    annotation_options=[
        dl.AnnotationType.BOX,
        dl.AnnotationType.SEGMENTATION
    ],
    upload_items=True,
    to_polygon=True
))

# 🎯 YOLO to Dataloop
yolo_dataset = dl.datasets.get(dataset_id="dataset_id")
asyncio.run(converter.yolo_to_dataloop(
    dataset=yolo_dataset,
    input_items_path=r"C:/path/to/yolo/items",
    # Make sure item filenames match YOLO txt files! 🎯
    input_annotations_path=r"C:/path/to/yolo/items/annotations",
    upload_items=True,
    add_labels_to_recipe=True,
    labels_txt_filepath=r"C:/path/to/yolo/items/labels/labels.txt"
))

# 🎯 VOC to Dataloop
voc_dataset = dl.datasets.get(dataset_id='dataset_id')
asyncio.run(converter.voc_to_dataloop(
    dataset=voc_dataset,
    input_items_path=r"C:/path/to/voc/items",
    # Make sure item filenames match VOC xml files! 🎯
    input_annotations_path=r"C:/path/to/voc/items/annotations",
    upload_items=True,
    add_labels_to_recipe=True
))

Converting FROM Dataloop Format ⬆️

Need to export your Dataloop annotations to other formats? Here's how:

import asyncio
import dtlpy as dl
from dtlpyconverters import coco_converters, yolo_converters, voc_converters

# Set up your filters (optional but powerful!) 🎯
filters = dl.Filters()
# Example: Get items from specific folder
filters.add(field=dl.FiltersKnownFields.DIR, values='/dog_name')
# Example: Filter for dog annotations
filters.add_join(field=dl.FiltersKnownFields.LABEL, values='dog')

# 🎯 Dataloop to COCO
coco_dataset = dl.datasets.get(dataset_id='')
coco_converter = coco_converters.DataloopToCoco(
    input_annotations_path=r'C:/input/coco',
    output_annotations_path=r'C:/output/coco',
    download_annotations=True,
    filters=filters,
    dataset=coco_dataset
)
asyncio.run(coco_converter.convert_dataset())

# 🎯 Dataloop to YOLO
yolo_dataset = dl.datasets.get(dataset_id='')
yolo_converter = yolo_converters.DataloopToYolo(
    input_annotations_path=r'C:/input/yolo',
    output_annotations_path=r'C:/output/yolo',
    download_annotations=True,
    filters=filters,
    dataset=yolo_dataset
)
asyncio.run(yolo_converter.convert_dataset())

# 🎯 Dataloop to VOC
voc_dataset = dl.datasets.get(dataset_id='')
voc_converter = voc_converters.DataloopToVoc(
    input_annotations_path=r'C:/input/voc',
    output_annotations_path=r'C:/output/voc',
    download_annotations=True,
    filters=filters,
    dataset=voc_dataset
)
asyncio.run(voc_converter.convert_dataset())

Pro Tips! 💡

  • Always check that your item filenames match the annotation files (a quick sanity check is sketched below)
  • Use filters to convert specific subsets of your data
  • Remember that converter functions are async - use asyncio.run()!
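
For the first tip, a quick sanity check before converting can save a lot of debugging. This sketch compares local filenames against a COCO JSON; the paths are hypothetical and the 'images'/'file_name' keys follow the standard COCO layout:

import json
import os

items_dir = r"C:/path/to/coco/items"  # hypothetical local items folder
coco_json_path = r"C:/path/to/coco/items/annotations/annotations.json"  # hypothetical COCO file

with open(coco_json_path) as f:
    coco = json.load(f)

referenced = {image['file_name'] for image in coco['images']}
available = set(os.listdir(items_dir))

missing = referenced - available
if missing:
    print(f"{len(missing)} files referenced in the COCO JSON are missing locally")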

Best Practices for Large Scale Operations 🎯

Error Handling 🛡️

Error Handling: Always include error handling for large operations

try:
    items = dataset.items.upload(local_path=large_batch)
except dl.exceptions.PlatformException as e:
    print(f"Platform error: {e}")
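
For transient failures such as network hiccups, a simple retry wrapper around the same exception handling can keep one bad attempt from stopping a large batch. A minimal sketch, not a prescribed pattern (the retry count and delay are arbitrary):

import time

def upload_with_retry(dataset, local_path, retries=3, delay=5):
    for attempt in range(1, retries + 1):
        try:
            return dataset.items.upload(local_path=local_path)
        except dl.exceptions.PlatformException as e:
            print(f"Attempt {attempt} failed: {e}")
            if attempt == retries:
                raise  # Give up after the last attempt
            time.sleep(delay)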

Performance Optimization 🚀

Batch Processing: Group operations for better performance

# Example: Batch upload with progress tracking
items_batch = []
with tqdm.tqdm(total=len(file_list)) as pbar:
    for i, file_path in enumerate(file_list):
        items_batch.append({
            'local_path': file_path,
            'remote_name': f'processed_{i}.jpg'
        })
        if len(items_batch) == 100:  # Upload in batches of 100
            dataset.items.upload(local_path=pd.DataFrame(items_batch))
            pbar.update(len(items_batch))
            items_batch = []
    if items_batch:  # Upload any remaining items from the final partial batch
        dataset.items.upload(local_path=pd.DataFrame(items_batch))
        pbar.update(len(items_batch))

Progress Tracking: Use progress bars for long operations

# Track progress for any operation
def process_with_progress(items):
    with tqdm.tqdm(total=len(items)) as pbar:
        for item in items:
            # Your processing logic here
            process_item(item)
            pbar.update(1)

Parallel Processing 🔄

Multi-threading: Use parallel processing for large datasets

from concurrent.futures import ThreadPoolExecutor

def process_in_parallel(items, max_workers=32):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for item in items:
            future = executor.submit(process_item, item)
            futures.append(future)

        # Wait for all tasks to complete
        for future in futures:
            future.result()
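
If you'd rather have the progress bar advance as tasks finish (instead of in submission order), concurrent.futures.as_completed pairs nicely with tqdm. A small variation on the sketch above, reusing the process_item stub from earlier:

from concurrent.futures import ThreadPoolExecutor, as_completed
import tqdm

def process_in_parallel_with_progress(items, max_workers=32):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_item, item) for item in items]
        with tqdm.tqdm(total=len(futures)) as pbar:
            for future in as_completed(futures):
                future.result()  # Re-raises any exception from the worker
                pbar.update(1)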

Logging 📝

Logging: Maintain detailed logs for debugging

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def process_with_logging(item):
    try:
        logger.info(f"Processing item: {item.id}")
        # Your processing logic here
        logger.info(f"Successfully processed item: {item.id}")
    except Exception as e:
        logger.error(f"Error processing item {item.id}: {str(e)}")
        raise

Need More Help? 🤔

Happy coding! 🚀