Query individual files#

Here, we’ll query individual files and inspect their metadata.

You can skip this guide if you are only interested in working with the dataset as a whole.

import lamindb as ln
import lnschema_bionty as lb
import anndata as ad

💡 loaded instance: testuser1/test-scrna (lamindb 0.54.4)

# Track this notebook; the log lines below report the Transform and Run records.
ln.track()

💡 notebook imports: anndata==0.9.2 lamindb==0.54.4 lnschema_bionty==0.31.2

💡 Transform(id='agayZTonayqAz8', name='Query individual files', short_name='scrna2', version='0', type=notebook, updated_at=2023-10-01 16:43:24, created_by_id='DzTjkKse')

💡 Run(id='dMp7djqemZGyTyWkYRse', run_at=2023-10-01 16:43:24, transform_id='agayZTonayqAz8', created_by_id='DzTjkKse')

Access #

Query files by provenance metadata#

# Lookup object for User records; `users.testuser1` is used in the filter below.
users = ln.User.lookup()

# Restrict transforms to those created by testuser1, then search them for "scrna".
ln.Transform.filter(created_by=users.testuser1).search("scrna")

	id	__ratio__
name
scRNA-seq	Nv48yAceNSh8z8	90.0
Append a new batch of data	ManDYgmftZ8Cz8	36.0
Query individual files	agayZTonayqAz8	36.0

# Fetch the transform with this id (the "scRNA-seq" hit in the search result above).
transform = ln.Transform.filter(id="Nv48yAceNSh8z8").one()

# Show the files linked to that transform as a table.
ln.File.filter(transform=transform).df()

	storage_id	key	suffix	accessor	description	version	size	hash	hash_type	transform_id	run_id	initial_version_id	updated_at	created_by_id
id
vAk7VTi8De3y0rT7H8u9	mw7lxLgT	None	.h5ad	AnnData	Conde22	None	28049505	WEFcMZxJNmMiUOFrcSTaig	md5	Nv48yAceNSh8z8	QbgMTXxxJxPHD0ovcimm	None	2023-10-01 16:42:46	DzTjkKse

Query files based on biological metadata#

# Lookup objects for the three bionty registries used as filter values below.
assays = lb.ExperimentalFactor.lookup()
species = lb.Species.lookup()
cell_types = lb.CellType.lookup()

# Filter files by assay, species and cell type labels
# (presumably combined with AND -- confirm against the lamindb filter docs).
query = ln.File.filter(
    experimental_factors=assays.single_cell_rna_sequencing,
    species=species.human,
    cell_types=cell_types.gamma_delta_t_cell,
)

# Show the matching files as a table.
query.df()

	storage_id	key	suffix	accessor	description	version	size	hash	hash_type	transform_id	run_id	initial_version_id	updated_at	created_by_id
id
tKBg5wZP51aKaLiPXbGK	mw7lxLgT	None	.h5ad	AnnData	10x reference adata	None	660792	a2V0IgOjMRHsCeZH169UOQ	md5	ManDYgmftZ8Cz8	k9J7cQZqsnA5sK7oIO1S	None	2023-10-01 16:43:12	DzTjkKse
vAk7VTi8De3y0rT7H8u9	mw7lxLgT	None	.h5ad	AnnData	Conde22	None	28049505	WEFcMZxJNmMiUOFrcSTaig	md5	Nv48yAceNSh8z8	QbgMTXxxJxPHD0ovcimm	None	2023-10-01 16:42:46	DzTjkKse

Transform #

Compare gene sets#

Get file objects:

# Fetch each file explicitly by its description rather than unpacking an
# unfiltered query. Tuple-unpacking `ln.File.filter().list()` only works while
# the instance holds exactly two files, and it silently depends on the order
# in which they are returned. `.one()` is the same single-record accessor
# used for the transform query earlier in this guide.
file1 = ln.File.filter(description="Conde22").one()
file2 = ln.File.filter(description="10x reference adata").one()

# Print the file's provenance, feature sets and labels (output below).
file1.describe()

File(id='vAk7VTi8De3y0rT7H8u9', suffix='.h5ad', accessor='AnnData', description='Conde22', size=28049505, hash='WEFcMZxJNmMiUOFrcSTaig', hash_type='md5', updated_at=2023-10-01 16:42:46)

Provenance:
  🗃️ storage: Storage(id='mw7lxLgT', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna', type='local', updated_at=2023-10-01 16:42:00, created_by_id='DzTjkKse')
  📔 transform: Transform(id='Nv48yAceNSh8z8', name='scRNA-seq', short_name='scrna', version='0', type='notebook', updated_at=2023-10-01 16:42:46, created_by_id='DzTjkKse')
  👣 run: Run(id='QbgMTXxxJxPHD0ovcimm', run_at=2023-10-01 16:42:05, transform_id='Nv48yAceNSh8z8', created_by_id='DzTjkKse')
  👤 created_by: User(id='DzTjkKse', handle='testuser1', email='testuser1@lamin.ai', name='Test User1', updated_at=2023-10-01 16:42:00)
  ⬇️ input_of (core.Run): ['2023-10-01 16:42:52']
Features:
  var: FeatureSet(id='NbpG08k5MyhtFXpSVTO8', n=36503, type='number', registry='bionty.Gene', hash='dnRexHCtxtmOU81_EpoJ', updated_at=2023-10-01 16:42:37, modality_id='ea9UjVtc', created_by_id='DzTjkKse')
    'FILIP1', 'PTGES3L', 'PIGA', 'PSKH1', 'TONSL-AS1', 'None', 'None', 'None', 'IGLV2-8', 'FOXN3', 'None', 'None', 'None', 'LINC03007', 'ASTN1', 'RASSF5', 'TMEM220-AS1', 'LMO7DN', 'None', 'None', ...
  obs: FeatureSet(id='eqbPhBQrbb7AoBWu5XY8', n=4, registry='core.Feature', hash='BUgwklP7zIs3Qx8yNIJ1', updated_at=2023-10-01 16:42:41, modality_id='FqWJG3xl', created_by_id='DzTjkKse')
    🔗 assay (4, bionty.ExperimentalFactor): 'single-cell RNA sequencing', '10x 5' v1', '10x 3' v3', '10x 5' v2'
    🔗 donor (12, core.ULabel): '582C', 'A52', 'A37', 'D496', '640C', '621B', 'A29', 'A35', 'A36', '637C', ...
    🔗 tissue (17, bionty.Tissue): 'thoracic lymph node', 'lung', 'duodenum', 'bone marrow', 'blood', 'omentum', 'transverse colon', 'caecum', 'mesenteric lymph node', 'lamina propria', ...
    🔗 cell_type (32, bionty.CellType): 'alpha-beta T cell', 'alveolar macrophage', 'macrophage', 'progenitor cell', 'CD4-positive helper T cell', 'lymphocyte', 'effector memory CD4-positive, alpha-beta T cell', 'plasmacytoid dendritic cell', 'CD16-negative, CD56-bright natural killer cell, human', 'classical monocyte', ...
Labels:
  🏷️ species (1, bionty.Species): 'human'
  🏷️ tissues (17, bionty.Tissue): 'thoracic lymph node', 'lung', 'duodenum', 'bone marrow', 'blood', 'omentum', 'transverse colon', 'caecum', 'mesenteric lymph node', 'lamina propria', ...
  🏷️ cell_types (32, bionty.CellType): 'alpha-beta T cell', 'alveolar macrophage', 'macrophage', 'progenitor cell', 'CD4-positive helper T cell', 'lymphocyte', 'effector memory CD4-positive, alpha-beta T cell', 'plasmacytoid dendritic cell', 'CD16-negative, CD56-bright natural killer cell, human', 'classical monocyte', ...
  🏷️ experimental_factors (4, bionty.ExperimentalFactor): 'single-cell RNA sequencing', '10x 5' v1', '10x 3' v3', '10x 5' v2'
  🏷️ ulabels (12, core.ULabel): '582C', 'A52', 'A37', 'D496', '640C', '621B', 'A29', 'A35', 'A36', '637C', ...

# Display the flow diagram for this file (rendered as the SVG linked below).
file1.view_flow()

https://d33wubrfki0l68.cloudfront.net/1507240f0e9d765104ea745a29deb49b6f4bad1c/488db/_images/d3a042c23f9fe889ae3909beb7bb7e21d4fd406842ad7d2ede8891e27a6ab1d7.svg

# Print the second file's provenance, feature sets and labels (output below).
file2.describe()

File(id='tKBg5wZP51aKaLiPXbGK', suffix='.h5ad', accessor='AnnData', description='10x reference adata', size=660792, hash='a2V0IgOjMRHsCeZH169UOQ', hash_type='md5', updated_at=2023-10-01 16:43:12)

Provenance:
  🗃️ storage: Storage(id='mw7lxLgT', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna', type='local', updated_at=2023-10-01 16:42:00, created_by_id='DzTjkKse')
  📔 transform: Transform(id='ManDYgmftZ8Cz8', name='Append a new batch of data', short_name='scrna1', version='0', type='notebook', updated_at=2023-10-01 16:43:13, created_by_id='DzTjkKse')
  👣 run: Run(id='k9J7cQZqsnA5sK7oIO1S', run_at=2023-10-01 16:42:52, transform_id='ManDYgmftZ8Cz8', created_by_id='DzTjkKse')
  👤 created_by: User(id='DzTjkKse', handle='testuser1', email='testuser1@lamin.ai', name='Test User1', updated_at=2023-10-01 16:42:00)
Features:
  var: FeatureSet(id='2wFhGDUjuAeoU8jTR3XH', n=754, type='number', registry='bionty.Gene', hash='WMDxN7253SdzGwmznV5d', updated_at=2023-10-01 16:43:12, modality_id='ea9UjVtc', created_by_id='DzTjkKse')
    'NSUN6', 'S100A6', 'LAG3', 'EIF2AK1', 'DHRS4L2', 'LSM5', 'G0S2', 'CCDC107', 'PSMD7', 'HNRNPF', 'EIF3G', 'PSMC5', 'HLA-DMA', 'MFSD14B', 'OSBPL8', 'CD63', 'DHRS7', 'PNN', 'MRPS33', 'LYPD2', ...
  obs: FeatureSet(id='jugdraH3kleI6wzGVAaT', n=1, registry='core.Feature', hash='QFilx9ah7bacDSHBYJOD', updated_at=2023-10-01 16:43:12, modality_id='FqWJG3xl', created_by_id='DzTjkKse')
    🔗 cell_type (9, bionty.CellType): 'CD24-positive, CD4 single-positive thymocyte', 'B cell, CD19-positive', 'CD16-positive, CD56-dim natural killer cell, human', 'monocyte', 'gamma-delta T cell', 'cytotoxic T cell', 'CD8-positive, CD25-positive, alpha-beta regulatory T cell', 'dendritic cell', 'CD4-positive, alpha-beta T cell'
  external: FeatureSet(id='P1bXXRGZjslUlZiURh3N', n=2, registry='core.Feature', hash='pzAiye3Tiav8qFggirus', updated_at=2023-10-01 16:43:12, modality_id='FqWJG3xl', created_by_id='DzTjkKse')
    🔗 species (1, bionty.Species): 'human'
    🔗 assay (1, bionty.ExperimentalFactor): 'single-cell RNA sequencing'
Labels:
  🏷️ species (1, bionty.Species): 'human'
  🏷️ cell_types (9, bionty.CellType): 'CD24-positive, CD4 single-positive thymocyte', 'B cell, CD19-positive', 'CD16-positive, CD56-dim natural killer cell, human', 'monocyte', 'gamma-delta T cell', 'cytotoxic T cell', 'CD8-positive, CD25-positive, alpha-beta regulatory T cell', 'dendritic cell', 'CD4-positive, alpha-beta T cell'
  🏷️ experimental_factors (1, bionty.ExperimentalFactor): 'single-cell RNA sequencing'

# Display the flow diagram for the second file (rendered as the SVG linked below).
file2.view_flow()

https://d33wubrfki0l68.cloudfront.net/fffbdb3ea3163d8b4bb91e7d92f87c9922efe339/ced05/_images/ff1c122dc5eeb1634683e2d1022ee87a707ba73b610226e5bf16d4045be7da16.svg

Load files into memory:

# Load both files into memory; per the describe() output above, both use the
# AnnData accessor. The loaded objects are subset and concatenated further below.
file1_adata = file1.load()
file2_adata = file2.load()

The shared genes can be computed from the files' registered feature sets alone, without relying on the loaded data:

# The "var" feature sets (registry 'bionty.Gene' in the describe() output above)
# hold the genes measured in each file.
file1_genes = file1.features["var"]
file2_genes = file2.features["var"]

# Genes present in both files, and how many there are.
shared_genes = file1_genes & file2_genes
len(shared_genes)

# First ten shared genes, listed by their "symbol" field.
shared_genes.list("symbol")[:10]

['RCSD1',
 'SLTM',
 'MTCH2',
 'ITM2A',
 'STUB1',
 'ILF3-DT',
 'SNX2',
 'TOMM7',
 'ZNF106',
 'TXN']

Compare cell types#

# Cell type labels attached to each file.
file1_celltypes = file1.cell_types.all()
file2_celltypes = file2.cell_types.all()

# Cell types present in both files, listed by their "name" field;
# these names are used to subset the loaded data below.
shared_celltypes = file1_celltypes & file2_celltypes
shared_celltypes_names = shared_celltypes.list("name")
shared_celltypes_names

['CD16-positive, CD56-dim natural killer cell, human', 'gamma-delta T cell']

We can now subset the two datasets by shared cell types:

# Keep only the observations whose obs["cell_type"] value is one of the shared
# cell type names.
file1_adata_subset = file1_adata[
    file1_adata.obs["cell_type"].isin(shared_celltypes_names)
]

# Same selection for the second dataset.
file2_adata_subset = file2_adata[
    file2_adata.obs["cell_type"].isin(shared_celltypes_names)
]

Concatenate subsetted datasets:

# Concatenate the two subsets. `label="file"` produces the 'file' obs column
# seen in the output below, and its values are the file descriptions passed as
# `keys`.
adata_concat = ad.concat(
    [file1_adata_subset, file2_adata_subset],
    label="file",
    keys=[file1.description, file2.description],
)
adata_concat

AnnData object with n_obs × n_vars = 187 × 749
    obs: 'cell_type', 'file'
    obsm: 'X_umap'

# Count observations per (cell_type, file) combination.
adata_concat.obs.value_counts()

cell_type                                           file               
CD16-positive, CD56-dim natural killer cell, human  Conde22                114
gamma-delta T cell                                  Conde22                 66
                                                    10x reference adata      4
CD16-positive, CD56-dim natural killer cell, human  10x reference adata      3
dtype: int64