Quickstart

# Use a local instance for testing: storage lives under ./test-alo and the
# bionty, findrefs, and alo schema modules are requested via --schema.
!lamin init --storage ./test-alo --schema bionty,findrefs,alo
→ connected lamindb: testuser1/test-alo
import lamindb as ln
import bionty as bt
import findrefs as fr
import alo
import pandas as pd
→ connected lamindb: testuser1/test-alo

Regex validation:

from lnschema_core.validation import FieldValidationError

# A uid that does not match the expected pattern is rejected when the record
# is constructed; print the validation message instead of letting it propagate.
try:
    alo.BenchlingEntry(uid="123")
except FieldValidationError as validation_error:
    print(validation_error)
  uid: 123 is not valid
    → Must be of the form 'EXP1234'.
  

Email validation:

# An email_address that is not a well-formed address is rejected when the
# record is constructed. The result is deliberately not bound to a name:
# construction raises here, so an assignment target would never be set.
try:
    alo.Person(uid="person:1", name="John Doe", email_address="abc")
except FieldValidationError as validation_error:
    print(validation_error)
  email_address: abc is not valid
    → Enter a valid email address.
  
# With a well-formed email address the record passes validation and is saved.
alo.Person(
    uid="person:1",
    name="John Doe",
    email_address="john.doe@gmail.com",
).save()
Person(uid='person:1', name='John Doe', email_address='john.doe@gmail.com', created_by_id=1, created_at=2024-11-21 12:41:35 UTC)

Enum validation:

# role only accepts a fixed set of values; anything else is rejected when the
# record is constructed.
try:
    alo.Person(
        uid="person:2",
        name="Jane Doe",
        email_address="jane.doe@gmail.com",
        role="new role",
    )
except FieldValidationError as validation_error:
    print(validation_error)
  role: new role is not a valid
    → Valid values are: PI, analyst, requester, scientist
  

Pre-populate some records:

from typing import get_args
from alo.types import ModalityType

# Modalities: create one ULabel per option declared in ModalityType, save
# them, and attach them as children of a parent "modality" label.
modalities = [
    ln.ULabel(name=modality_name) for modality_name in get_args(ModalityType)
]
ln.save(modalities)
modality = ln.ULabel(name="modality").save()
modality.children.set(modalities)

# Phenotypes: import records from the source named "pato".
bt.Phenotype.import_source(bt.Source.filter(name="pato").first())

# References: a single reference identified by its DOI.
fr.Reference(name="my reference", doi="https://doi.org/10.1234/123456").save()

# Samples: two records keyed by uid. The trailing semicolon on the last line
# keeps the notebook from displaying the saved record.
alo.Sample(uid="sample:123", name="Sample 123").save()
alo.Sample(uid="sample:456", name="Sample 456").save();

Curate SingleValueMIADPackage

# Example metadata table, written one record (row) at a time. Every record
# lists the same keys in the same order, which fixes the column order.
df = pd.DataFrame(
    [
        {
            "assay": "RNA-seq",
            "benchling_identifier": "EXP1235",
            "biological_sex": "female",
            "cell_line": "K562",
            "cell_type": "B cell",
            "contact": "John Doe",
            "disease": "RAB18 deficiency",
            "developmental_stage": "embryonic stage",
            "doi": "https://doi.org/10.1234/123456",
            "modality": "unknown",
            "sample_accession": "sample:123",
            "species": "human",
            "tissue": "blood",
        },
        {
            "assay": "ATAC-seq",
            "benchling_identifier": "EXP5678",
            "biological_sex": "male",
            "cell_line": "HEK293",
            "cell_type": "T cell",
            "contact": "John Doe",
            "disease": "RAB18 deficiency",
            "developmental_stage": "embryonic stage",
            "doi": "https://doi.org/10.1234/123456",
            "modality": "Genomics",
            "sample_accession": "sample:456",
            "species": "human",
            "tissue": "brain",
        },
    ]
)

# Display the table.
df
assay benchling_identifier biological_sex cell_line cell_type contact disease developmental_stage doi modality sample_accession species tissue
0 RNA-seq EXP1235 female K562 B cell John Doe RAB18 deficiency embryonic stage https://doi.org/10.1234/123456 unknown sample:123 human blood
1 ATAC-seq EXP5678 male HEK293 T cell John Doe RAB18 deficiency embryonic stage https://doi.org/10.1234/123456 Genomics sample:456 human brain
# Column → registry field mapping: each DataFrame column listed here is
# validated against the given field of the given registry.
SingleValueMIADPackage_criteria = {
    "assay": bt.ExperimentalFactor.name,
    "benchling_identifier": alo.BenchlingEntry.uid,
    "biological_sex": bt.Phenotype.name,
    "cell_line": bt.CellLine.name,
    "cell_type": bt.CellType.name,
    "contact": alo.Person.name,
    "disease": bt.Disease.name,
    "developmental_stage": bt.DevelopmentalStage.name,
    "doi": fr.Reference.doi,
    "modality": ln.ULabel.name,
    "sample_accession": alo.Sample.uid,
    "species": bt.Organism.name,
    "tissue": bt.Tissue.name,
}

# Build a curator for the DataFrame, treating the mapped columns as categoricals.
curator = ln.Curator.from_df(df, categoricals=SingleValueMIADPackage_criteria)
✓ added 13 records with Feature.name for columns: 'assay', 'benchling_identifier', 'biological_sex', 'cell_line', 'cell_type', 'contact', 'disease', 'developmental_stage', 'doi', 'modality', 'sample_accession', 'species', 'tissue'
# Validate every mapped column. In this run the call returns False because
# two benchling_identifier values are not validated yet.
curator.validate()
• saving validated records of 'assay'
• saving validated records of 'cell_line'
• saving validated records of 'cell_type'
• saving validated records of 'disease'
• saving validated records of 'developmental_stage'
• saving validated records of 'species'
• saving validated records of 'tissue'
✓ 'assay' is validated against ExperimentalFactor.name
• mapping benchling_identifier on BenchlingEntry.uid
!    2 terms are not validated: 'EXP1235', 'EXP5678'
→ fix typos, remove non-existent values, or save terms via .add_new_from('benchling_identifier')
✓ 'biological_sex' is validated against Phenotype.name
✓ 'cell_line' is validated against CellLine.name
✓ 'cell_type' is validated against CellType.name
✓ 'contact' is validated against Person.name
✓ 'disease' is validated against Disease.name
✓ 'developmental_stage' is validated against DevelopmentalStage.name
✓ 'doi' is validated against Reference.doi
✓ 'modality' is validated against ULabel.name
✓ 'sample_accession' is validated against Sample.uid
✓ 'species' is validated against Organism.name
✓ 'tissue' is validated against Tissue.name
False
# Save the not-yet-validated benchling_identifier values as new records.
curator.add_new_from("benchling_identifier")
✓ added 2 records with BenchlingEntry.uid for benchling_identifier: 'EXP5678', 'EXP1235'
# Save the curated DataFrame as an artifact; the columns are validated again
# as part of saving.
artifact = curator.save_artifact(description="my first SingleValueMIADPackage artifact")
Hide code cell output
✓ 'assay' is validated against ExperimentalFactor.name
✓ 'benchling_identifier' is validated against BenchlingEntry.uid
✓ 'biological_sex' is validated against Phenotype.name
✓ 'cell_line' is validated against CellLine.name
✓ 'cell_type' is validated against CellType.name
✓ 'contact' is validated against Person.name
✓ 'disease' is validated against Disease.name
✓ 'developmental_stage' is validated against DevelopmentalStage.name
✓ 'doi' is validated against Reference.doi
✓ 'modality' is validated against ULabel.name
✓ 'sample_accession' is validated against Sample.uid
✓ 'species' is validated against Organism.name
✓ 'tissue' is validated against Tissue.name
! no run & transform got linked, call `ln.track()` & re-run
! run input wasn't tracked, call `ln.track()` and re-run
# Print a summary of the artifact: provenance, labels, features, and feature sets.
artifact.describe()
Artifact(uid='YhqreC34G8ZdFG1x0000', is_latest=True, description='my first SingleValueMIADPackage artifact', suffix='.parquet', type='dataset', size=8158, hash='byPEZgHFD3ihiS18fmrELQ', _hash_type='md5', _accessor='DataFrame', visibility=1, _key_is_virtual=True, created_at=2024-11-21 12:41:43 UTC)
  Provenance
    .storage = '/home/runner/work/alo/alo/docs/test-alo'
    .created_by = 'testuser1'
  Labels
    .organisms = 'human'
    .tissues = 'brain', 'blood'
    .cell_types = 'B cell', 'T cell'
    .diseases = 'RAB18 deficiency'
    .cell_lines = 'HEK293', 'K 562 cell'
    .phenotypes = 'female', 'male'
    .experimental_factors = 'ATAC-seq', 'RNA-Seq'
    .developmental_stages = 'embryonic stage'
    .ulabels = 'unknown', 'Genomics'
  Features
    'assay' = 'ATAC-seq', 'RNA-Seq'
    'biological_sex' = 'female', 'male'
    'cell_line' = 'HEK293', 'K 562 cell'
    'cell_type' = 'B cell', 'T cell'
    'developmental_stage' = 'embryonic stage'
    'disease' = 'RAB18 deficiency'
    'modality' = 'Genomics', 'unknown'
    'species' = 'human'
    'tissue' = 'blood', 'brain'
  Feature sets
    'columns' = 'assay', 'benchling_identifier', 'biological_sex', 'cell_line', 'cell_type', 'contact', 'disease', 'developmental_stage', 'doi', 'modality', 'sample_accession', 'species', 'tissue'