Data Management Fundamentals 💾
Master the essentials of managing your data in Dataloop - from uploading files to organizing your datasets efficiently.
Uploading Items 📤
1. Single Item Upload
import dtlpy as dl

# Assumes an authenticated SDK session and an existing `dataset` entity

# Upload a single file
item = dataset.items.upload(
    local_path='/path/to/image.jpg',
    remote_path='/folder/in/dataset'  # Optional
)

# Upload with metadata
item = dataset.items.upload(
    local_path='/path/to/image.jpg',
    item_metadata={'photographer': 'John Doe',
                   'location': 'New York'}
)
2. Batch Upload
# Upload entire directory
dataset.items.upload(
    local_path='/path/to/folder',
    local_annotations_path='/path/to/annotations'  # Optional
)

# Upload multiple specific files
items = dataset.items.upload(
    local_path=['/path/1.jpg', '/path/2.jpg'],
    remote_path='/batch-upload'
)
File Organization Strategies 📂
1. Directory Structure
# Move items
item.move(
    new_path='/new/path/item.jpg'
)
# List directory contents
pages = dataset.items.list(filters=dl.Filters(field='dir', values='/folder'))
2. Item Management
# Get item by ID or filename
item = dataset.items.get(item_id='item_id')
item = dataset.items.get(filepath='/images/item.jpg')
# Update item
item.name = 'new_name.jpg'
item.update()
# Delete items
dataset.items.delete(filters=dl.Filters(field='dir', values='/old/folder'))
3. Metadata Organization
# Add metadata to item
item.metadata['user'] = {
    'status': 'reviewed',
    'quality': 'high',
    'tags': ['validated', 'ready']
}
item = item.update()
# Batch metadata update
filters = dl.Filters(field='dir', values='/folder')
dataset.items.update(
    filters=filters,
    update_values={'metadata': {'user': {'status': 'reviewed'}}}
)
Batch Operations ⚡
1. Bulk Upload and Download
# Bulk upload
dataset.items.upload(
    local_path='/path/to/folder',
    remote_path='/dataset/folder'
)

# Bulk download
dataset.items.download(
    local_path='/local/folder',
    filters=dl.Filters(field='dir', values='/dataset/folder')
)
2. Batch Processing
# Process multiple items
filters = dl.Filters(field='dir', values='/folder')
pages = dataset.items.list(filters=filters)
for page in pages:
    for item in page:
        # Process each item
        process_item(item)

# Batch delete
dataset.items.delete(filters=filters)
3. Concurrent Operations
import concurrent.futures

def process_batch(items):
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(process_item, item) for item in items]
        concurrent.futures.wait(futures)
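To tie this together, here is a minimal usage sketch: it feeds the pool from a filtered listing and assumes `process_item` is your own per-item function (a placeholder, as in the batch-processing example above).
# Gather items from a filtered listing, then hand them to the thread pool
filters = dl.Filters(field='dir', values='/folder')
items = list(dataset.items.list(filters=filters).all())
process_batch(items)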
Data Validation 🔍
1. Item Validation
# Check item existence
try:
    item = dataset.items.get(filepath='/path/to/item.jpg')
except dl.exceptions.NotFound:
    print("Item not found!")

# Validate item metadata
def validate_item(item):
    required_fields = ['status', 'quality']
    metadata = item.metadata.get('user', {})
    return all(field in metadata for field in required_fields)
2. Batch Validation
# Validate multiple items
def validate_items(dataset):
    invalid_items = []
    for item in dataset.items.list().all():
        if not validate_item(item):
            invalid_items.append(item.id)
    return invalid_items
3. Data Integrity Checks
from PIL import Image

# Check for corrupted images
def check_image_integrity(item):
    try:
        buffer = item.download(save_locally=False)
        Image.open(buffer)
        return True
    except Exception as e:
        print(f"Corrupted image {item.name}: {e}")
        return False
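A small usage sketch, assuming you want to sweep a folder and collect the IDs of items that fail the check (the filter path is a placeholder):
# Run the integrity check over a filtered listing and collect failing item IDs
corrupted_ids = []
filters = dl.Filters(field='dir', values='/folder')
for item in dataset.items.list(filters=filters).all():
    if not check_image_integrity(item):
        corrupted_ids.append(item.id)
print(f"Found {len(corrupted_ids)} corrupted items")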
Best Practices 👑
1. File Organization
- Use consistent folder structures (see the sketch after this list)
- Implement clear naming conventions
- Keep related files together
- Document organization schema
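As a sketch of these conventions in practice - the train/val/test split names and the local and remote paths below are illustrative assumptions, not a Dataloop requirement:
# Mirror a local train/val/test layout in the dataset under one consistent prefix
splits = ['train', 'val', 'test']
for split in splits:
    dataset.items.upload(
        local_path=f'/data/{split}',      # illustrative local folder per split
        remote_path=f'/images/{split}'    # consistent remote folder per split
    )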
2. Error Handling
def safe_upload(path):
    try:
        item = dataset.items.upload(local_path=path, raise_on_error=True)
        return True, item
    except Exception as e:
        return False, str(e)
Pro Tips 💡
Efficient Data Organization
# Create organized folder structure
folders = ['train', 'val', 'test']
for folder in folders:
    dataset.directory_tree.create(directory=f'/{folder}')
Bulk Operations with Progress
from tqdm import tqdm

def upload_with_progress(files):
    for file in tqdm(files, desc="Uploading"):
        dataset.items.upload(local_path=file)
Troubleshooting Guide 🔧
Common Issues:
Upload Failures
import time

# Retry mechanism with exponential backoff
def upload_with_retry(path, max_retries=3):
    for attempt in range(max_retries):
        try:
            return dataset.items.upload(local_path=path, raise_on_error=True)
        except Exception as e:
            if attempt == max_retries - 1:
                raise e
            time.sleep(2 ** attempt)  # Exponential backoff
Download Issues
# Handle download errors
try:
    item.download(local_path='path/to/save')
except dl.exceptions.NotFound:
    print("Item not found!")
except Exception as e:
    print(f"Download failed: {e} - consider retrying")
Ready to start annotating your data? Let's move on to the annotation chapter! 🎯