Mercurial > hg > orthanc-databases
view NOTES @ 523:9413451fd984 large-queries
ExecuteSetResourcesContentTags is now using a cached prepared statement
author | Alain Mazy <am@orthanc.team> |
---|---|
date | Tue, 09 Jul 2024 16:21:25 +0200 |
parents | bf4b9c7cf338 |
children |
line wrap: on
line source
Resources: ********* - PG transaction modes explained: https://www.postgresql.org/files/developer/concurrency.pdf - Isolation level explained (PG + MySQL): https://amirsoleimani.medium.com/understanding-database-isolation-level-via-examples-mysql-and-postgres-a86b5502d404 - Message queuing in PG: https://www.crunchydata.com/blog/message-queuing-using-native-postgresql Create and delete instances Internals: ************************************* isNewInstance = CreateInstance(...) if (!isNewInstance && overwriteInstances) DeleteResource(instance) -> ClearDeletedFiles(manager); DELETE FROM DeletedFiles ------> this is not a temporary table in postgres but it is supposed to be empty before and after each transaction -> it is empty when taking a snapshot of the DB in READ COMMITTED mode!!! ClearDeletedResources(manager); DELETE FROM DeletedResources ------> this is not a temporary table in postgres but it is supposed to be empty before and after each transaction !!! DELETE FROM RemainingAncestor ------> this is not a temporary table in postgres but it is supposed to be empty before and after each transaction !!! DELETE FROM Resources WHERE internalId=.. -> cascades delete the MainDicomTags, the Metadata and the AttachedFiles -> this triggers AttachedFileDeletedFunc INSERT INTO DeletedFiles VALUES (old.uuid, old.filetype, old.compressedSize, old.uncompressedSize, old.compressionType, old.uncompressedHash, old.compressedHash); RETURN NULL; -> this triggers a SQL trigger: ResourceDeletedFunc INSERT INTO DeletedResources VALUES (old.resourceType, old.publicId); IF EXISTS (SELECT 1 FROM Resources WHERE parentId = old.parentId) THEN -- Signal that the deleted resource has a remaining parent -- (a parent that must not be deleted but whose LastUpdate must be updated) INSERT INTO RemainingAncestor SELECT resourceType, publicId FROM Resources WHERE internalId = old.parentId; ELSE -- Delete a parent resource when its unique child is deleted DELETE FROM Resources WHERE internalId = old.parentId; END IF; SELECT * FROM RemainingAncestor -> SignalRemainingAncestor() // There is at most 1 remaining ancestor -> ServerIndex::TransactionContext::SignalRemainingAncestor() -> stores remainingType and remainingPublicId (this is used in StatelessDatabaseOperations::DeleteResource to build the Rest Response of /delete and to update the LastUpdate of all parent (only when deleted from /delete)) SignalDeletedFiles(output, manager); SELECT * FROM DeletedFiles -> SignalDeletedAttachment() -> ServerIndex::TransactionContext::SignalAttachmentDeleted() -> pendingFilesToRemove_.push_back(FileToRemove(info)) (files are deleted in CommitFilesToRemove in the ServerIndex::TransactionContext::Commit) SignalDeletedResources(output, manager); SELECT resourceType, publicId FROM DeletedResources -> SignalDeletedResource() -> Emit DeletedResource event (lua) if (!CreateInstance(...)) Error: "No new instance while overwriting; this should not happen" if isNewInstance -> LogChange if isNewSeries -> LogChange .... Sample SQL code that you can execute in DBeaver to test new functions/procedures: CreateInstance ************************************************************************ CREATE OR REPLACE FUNCTION CreateInstance( IN patient_public_id TEXT, IN study_public_id TEXT, IN series_public_id TEXT, IN instance_public_id TEXT, OUT is_new_patient BIGINT, OUT is_new_study BIGINT, OUT is_new_series BIGINT, OUT is_new_instance BIGINT, OUT patient_internal_id BIGINT, OUT study_internal_id BIGINT, OUT series_internal_id BIGINT, OUT instance_internal_id BIGINT) AS $body$ BEGIN is_new_patient := 1; is_new_study := 1; is_new_series := 1; is_new_instance := 1; BEGIN INSERT INTO "resources" VALUES (DEFAULT, 0, patient_public_id, NULL); EXCEPTION WHEN unique_violation THEN is_new_patient := 0; END; SELECT internalid INTO patient_internal_id FROM "resources" WHERE publicId = patient_public_id AND resourcetype = 0 FOR UPDATE; -- also locks the resource and its parent to prevent from deletion while we complete this transaction BEGIN INSERT INTO "resources" VALUES (DEFAULT, 1, study_public_id, patient_internal_id); EXCEPTION WHEN unique_violation THEN is_new_study := 0; END; SELECT internalid INTO study_internal_id FROM "resources" WHERE publicId = study_public_id AND resourcetype = 1 FOR UPDATE; -- also locks the resource and its parent to prevent from deletion while we complete this transaction BEGIN INSERT INTO "resources" VALUES (DEFAULT, 2, series_public_id, study_internal_id); EXCEPTION WHEN unique_violation THEN is_new_series := 0; END; SELECT internalid INTO series_internal_id FROM "resources" WHERE publicId = series_public_id AND resourcetype = 2 FOR UPDATE; -- also locks the resource and its parent to prevent from deletion while we complete this transaction BEGIN INSERT INTO "resources" VALUES (DEFAULT, 3, instance_public_id, series_internal_id); EXCEPTION WHEN unique_violation THEN is_new_instance := 0; END; SELECT internalid INTO instance_internal_id FROM "resources" WHERE publicId = instance_public_id AND resourcetype = 3 FOR UPDATE; -- also locks the resource and its parent to prevent from deletion while we complete this transaction IF is_new_instance > 0 THEN -- Move the patient to the end of the recycling order. PERFORM PatientAddedOrUpdated(patient_internal_id, 1); END IF; END; $body$ LANGUAGE plpgsql; DO $$ DECLARE result record; begin delete from "resources"; SELECT * INTO result from CreateInstance('patient1', 'study1', 'series1', 'instance1'); RAISE NOTICE 'Value patientInternalId: %, is_new: %', result.patient_internal_id, result.is_new_patient; RAISE NOTICE 'Value studyInternalId: %, is_new: %', result.study_internal_id, result.is_new_study; RAISE NOTICE 'Value seriesInternalId: %, is_new: %', result.series_internal_id, result.is_new_series; RAISE NOTICE 'Value instanceInternalId: %, is_new: %', result.instance_internal_id, result.is_new_instance; RAISE NOTICE '--------------'; SELECT * INTO result from CreateInstance('patient1', 'study1', 'series1', 'instance2'); RAISE NOTICE 'Value patientInternalId: %, is_new: %', result.patient_internal_id, result.is_new_patient; RAISE NOTICE 'Value studyInternalId: %, is_new: %', result.study_internal_id, result.is_new_study; RAISE NOTICE 'Value seriesInternalId: %, is_new: %', result.series_internal_id, result.is_new_series; RAISE NOTICE 'Value instanceInternalId: %, is_new: %', result.instance_internal_id, result.is_new_instance; RAISE NOTICE '--------------'; SELECT * INTO result from CreateInstance('patient1', 'study1', 'series1', 'instance2'); RAISE NOTICE 'Value patientInternalId: %, is_new: %', result.patient_internal_id, result.is_new_patient; RAISE NOTICE 'Value studyInternalId: %, is_new: %', result.study_internal_id, result.is_new_study; RAISE NOTICE 'Value seriesInternalId: %, is_new: %', result.series_internal_id, result.is_new_series; RAISE NOTICE 'Value instanceInternalId: %, is_new: %', result.instance_internal_id, result.is_new_instance; RAISE NOTICE '--------------'; END $$; -- \set patient_key 'patient_key' -- SELECT CreateInstance('patient', 'study', 'series', 'instance', patient_key) ; -- drop function CreateInstance -- select * from "resources"; -- delete from "resources"; -- INSERT INTO "resources" VALUES (DEFAULT, 0, 'patient', NULL) ************************************************************************ In debug, no verbose logs, 10 connections Orthanc 1.12.1 + PG 5.1 (serializable mode) : test_concurrent_anonymize_same_study with 4 workers and 10x repeat: 43.957 s Orthanc mainline + PG mainline (read-committed mode) : test_concurrent_anonymize_same_study with 4 workers and 10x repeat: 15.744 s test_concurrent_anonymize_same_study deletion took: 18.8 s Orthanc 1.12.1 + PG 5.1 (serializable mode) : test_concurrent_uploads_same_study with 20 workers and 1x repeat: 21.214 s Orthanc mainline + PG mainline (read-committed mode) : test_concurrent_uploads_same_study with 20 workers and 1x repeat: 9.514 s Orthanc 1.12.1 + PG 5.1 (serializable mode) : test_upload_delete_same_study_from_multiple_threads with 5 workers and 3x repeat (10x): 23.016 s Orthanc mainline + PG mainline (read-committed mode) : test_upload_delete_same_study_from_multiple_threads with 5 workers and 3x repeat (10x): 7.129 s Orthanc mainline + PG mainline (read-committed mode) : test_upload_multiple_studies_from_multiple_threads with 10 workers and 25 files and repeat 3x: 6.454 s With Docker with 10 connections SQL: osimis/orthanc:24.1.2 : test_concurrent_anonymize_same_study with 4 workers and 10x repeat: 18.956 s FAIL !!! test_concurrent_anonymize_same_study deletion took: NA osimis/orthanc:current: test_concurrent_anonymize_same_study with 4 workers and 10x repeat: 6.867 s test_concurrent_anonymize_same_study deletion took: 9.095 s osimis/orthanc:24.1.2 : test_concurrent_uploads_same_study with 20 workers and 1x repeat: 9.822 s osimis/orthanc:current: test_concurrent_uploads_same_study with 20 workers and 1x repeat: 16.027 s up to 38s ! (slower but the test is not representative of a real life scenario !!!!!) osimis/orthanc:24.1.2 : test_upload_delete_same_study_from_multiple_threads with 5 workers and 3x repeat (10x): 12.966 s osimis/orthanc:current: test_upload_delete_same_study_from_multiple_threads with 5 workers and 3x repeat (10x): 4.196 s osimis/orthanc:24.1.2 : test_upload_multiple_studies_from_multiple_threads with 10 workers and 25 files and repeat 3x: 8.957 s osimis/orthanc:current: test_upload_multiple_studies_from_multiple_threads with 10 workers and 25 files and repeat 3x: 2.671 s Testing the connecions (note: Orthanc and PG server running on the same server) 10 connections : test_concurrent_anonymize_same_study with 4 workers and 10x repeat: 15.744 s 1 connection : test_concurrent_anonymize_same_study with 4 workers and 10x repeat: 21.341 s 10 connections : test_concurrent_uploads_same_study with 20 workers and 1x repeat: 6.57 s 1 connection : test_concurrent_uploads_same_study with 20 workers and 1x repeat: 10.223 s 10 connections : test_upload_delete_same_study_from_multiple_threads with 5 workers and 3x repeat (10x): 7.129 s 1 connection : test_upload_delete_same_study_from_multiple_threads with 5 workers and 3x repeat (10x): 11.172 s LARGE QUERIES ------------- new extended-api-v1/tools/find requirements (note: extended-api-v1 is a code name:-) ): - be able to list child-resources like in /studies/.../instances?expand => we need a "ParentStudy" in the query. Note: a single "Parent" field is not enough since we sometimes want to specify a "Parent" that is 2 or 3 levels higher (e.g /patients/../instances) e.g: { "Level": "Instance", "Query": { "ParentStudy": "orthanc-study-id", }, } - be able to search on labels like in the current tools/find implementation => we need a "ParentStudy" in the query. Note: a single "Parent" field is not enough since we sometimes want to specify a "Parent" that is 2 or 3 levels higher (e.g /patients/../instances) e.g: { "Level": "Study", "Query": { "Labels": ["hello"], "LabelsConstraint" : "All", }, } - be able to select what are the expected fields in the response => we need a "ExpandContent" field in the query. If there are no "ExpandContent", we simply return a list of IDs e.g: { "Level": "Series", "Query": { "ParentStudy": "orthanc-study-id", }, "ExpandContent": [ "Children", // the list of orthanc-ids of the child of the resources (equivalent to e.g /series/../instances) "Parent", // the resource parent id (e.g. "ParentStudy" for a series) "Labels", // the resources labels (equivalent to e.g /series/../labels) "Metadata", // the resources metadata (equivalent to e.g. /series/../metadata?expand) "MainDicomTags", // the resources MainDicomTags "FileInfo", // "FileSize" + "FileUuid", applicable only to instances // TODO: what about "IsStable", "Status", "LastUpdate", "IndexInSeries", "ExpectedNumberOfInstances" ??? // TODO: right now, we have a "RequestedTags" option that can list the DICOM TAGS and tools/find can fetch the tags from disk if they are not in the ExtraMainDicomTags ] } - be able to filter on DICOM Tags (and compose with a Parent orthanc id) e.g: get all US series from this study { "Level": "Series", "Query": { "ParentStudy": "orthanc-study-id", "DicomTags": { "Modality": "US" } } } - be able to filter on a metadata (and compose with a Parent orthanc id) e.g: to get all PDF instances of a study { "Level": "Instance", "Query": { "ParentStudy": "orthanc-study-id", "Metadata": { "SopClassUid": "1.2.840.10008.5.1.4.1.1.104.1" } } } - be able to order results on one or more DicomTags or Metadata + limit results + offset (for pagination) e.g: to get page 5 of all studies from this month { "Level": "Study", "Query": { "DicomTags": { "StudyDate": "20240101-20240131" }, "OrderBy": [ { "DicomTag": "StudyDate", "Order": "Desc" }, { "Metadata": "LastUpdate", "Order": "Desc" } ], "Limit": 1000, "Since": 5000 } } current queries - single level: ------------------------------ Search: sql = ("SELECT publicId, internalId " "FROM Resources " "WHERE resourceType = " + formatter.FormatResourceType(queryLevel) + " "); if (dicomIdentifiersComparisons.size() > 0) sql += (" AND internalId IN (SELECT id FROM DicomIdentifiers WHERE " + *it + ") "); if (mainDicomTagsComparisons.size() > 0) sql += (" AND internalId IN (SELECT id FROM MainDicomTags WHERE " + *it + ") "); if (!labels.empty()) sql += (" AND internalId " + inOrNotIn + " (SELECT id" " FROM (SELECT id, COUNT(1) AS labelsCount " "FROM Labels " "WHERE label IN (" + Join(formattedLabels, "", ", ") + ") GROUP BY id" ") AS temp " " WHERE labelsCount " + condition + ")"); if (limit != 0) sql += " LIMIT " + boost::lexical_cast<std::string>(limit); Patients: "SELECT patients_studies.patients_public_id, MIN(instances.publicId) AS instances_public_id " "FROM (SELECT patients.publicId AS patients_public_id, MIN(studies.internalId) AS studies_internal_id " "FROM (" + sql + ") AS patients " "INNER JOIN Resources studies ON studies.parentId = patients.internalId " "GROUP BY patients.publicId " ") AS patients_studies " "INNER JOIN Resources series ON series.parentId = patients_studies.studies_internal_id " "INNER JOIN Resources instances ON instances.parentId = series.internalId " "GROUP BY patients_studies.patients_public_id" Studies: "SELECT studies_series.studies_public_id, MIN(instances.publicId) AS instances_public_id " "FROM (SELECT studies.publicId AS studies_public_id, MIN(series.internalId) AS series_internal_id " "FROM (" + sql + ") AS studies " "INNER JOIN Resources series ON series.parentId = studies.internalId " "GROUP BY studies.publicId " ") AS studies_series " "INNER JOIN Resources instances ON instances.parentId = studies_series.series_internal_id " "GROUP BY studies_series.studies_public_id" Series: "SELECT series.publicId AS series_public_id, MIN(instances.publicId) AS instances_public_id " "FROM (" + sql + ") AS series " "INNER JOIN Resources instances ON instances.parentId = series.internalId " "GROUP BY series.publicId " Instances: "SELECT instances.publicId, instances.publicId FROM (" + sql + ") instances" current queries - multi level: ------------------------------ Search: ISqlLookupFormatter::Apply New queries: ----------- - Get all series ids where PatientID (16, 32)="ID-DYL-WAL-00000008" StudyDescription (8, 4144) %LIKE% "000000" order by StudyDate(8, 32) DESC Current: curl http://localhost:8044/tools/find -d '{"Level": "Series", "Query": {"StudyDescription": "*000000*", "PatientID": "ID-DYL-WAL-00000008"}}' SELECT series.publicId, MIN(instances.publicId) FROM (SELECT series.publicId, series.internalId FROM Resources AS series INNER JOIN Resources studies ON studies.internalId=series.parentId INNER JOIN DicomIdentifiers t0 ON t0.id = studies.internalId AND t0.tagGroup = 16 AND t0.tagElement = 32 -- Patient tags are copied at study level ! INNER JOIN DicomIdentifiers t1 ON t1.id = studies.internalId AND t1.tagGroup = 8 AND t1.tagElement = 4144 WHERE series.resourceType = 2 AND t0.value = 'ID-DYL-WAL-00000008' AND t1.value like '%000000%' ESCAPE '\' LIMIT 101) series INNER JOIN Resources instances ON instances.parentId = series.internalId GROUP BY series.publicId With order by, we must not retrieve the instance anymore: SELECT series.publicId, series.internalId FROM Resources AS series INNER JOIN Resources studies ON studies.internalId=series.parentId INNER JOIN DicomIdentifiers t0 ON t0.id = studies.internalId AND t0.tagGroup = 16 AND t0.tagElement = 32 -- Patient tags are copied at study level ! INNER JOIN DicomIdentifiers t1 ON t1.id = studies.internalId AND t1.tagGroup = 8 AND t1.tagElement = 4144 INNER JOIN DicomIdentifiers t2 ON t2.id = studies.internalId AND t2.tagGroup = 8 AND t2.tagElement = 32 WHERE series.resourceType = 2 AND t0.value = 'ID-DYL-WAL-00000008' AND t1.value like '%000000%' ESCAPE '\' ORDER BY t2 DESC LIMIT 101 Combine multiple search criteria: Search studies with a label constraints + a Metadata filter + a DICOM Tag filter + a ParentPatient filter SELECT studies.publicId, studies.internalId FROM Resources AS studies INNER JOIN WHERE resourceType = 1 AND internalId IN (SELECT id FROM DicomIdentifiers WHERE tagGroup = 16 AND tagElement = 32 AND value = $1) AND internalId IN (SELECT id FROM DicomIdentifiers WHERE tagGroup = 8 AND tagElement = 32 AND value >= $2) AND internalId IN (SELECT id FROM DicomIdentifiers WHERE tagGroup = 8 AND tagElement = 4144 AND value LIKE $3 ESCAPE '\') LIMIT 101) AS studies INNER JOIN Resources series ON series.parentId = studies.internalId GROUP BY studies.publicId ) AS studies_series INNER JOIN Resources instances ON instances.parentId = studies_series.series_internal_id GROUP BY studies_series.studies_public_id -- current search at study level (single level -> we use IN instead of INNER JOIN) SELECT publicId, internalId FROM Resources WHERE resourceType = 1 AND internalId IN (SELECT id FROM DicomIdentifiers WHERE tagGroup = 16 AND tagElement = 32 AND value = 'ID-DYL-WAL-00000008') AND internalId IN (SELECT id FROM DicomIdentifiers WHERE tagGroup = 8 AND tagElement = 32 AND value >= '19700101') AND internalId IN (SELECT id FROM DicomIdentifiers WHERE tagGroup = 8 AND tagElement = 4144 AND value LIKE '%000000%' ESCAPE '\') AND internalId IN (SELECT id FROM (SELECT id, COUNT(1) AS labelsCount FROM Labels WHERE label IN ('Label2', 'Label3') GROUP BY id) AS temp WHERE labelsCount >= 1) AND internalId IN (SELECT id FROM Metadata WHERE type=7 AND value >= '20240415T120000') LIMIT 101 -- same query with INNER JOIN to compare performance SELECT studies.publicId, studies.internalId FROM Resources AS studies INNER JOIN DicomIdentifiers t0 ON t0.id = studies.internalId AND t0.tagGroup = 16 AND t0.tagElement = 32 AND t0.value = 'ID-DYL-WAL-00000008' -- Patient tags are copied at study level ! INNER JOIN DicomIdentifiers t1 ON t1.id = studies.internalId AND t1.tagGroup = 8 AND t1.tagElement = 4144 AND t1.value like '%000000%' ESCAPE '\' INNER JOIN DicomIdentifiers t2 ON t2.id = studies.internalId AND t2.tagGroup = 8 AND t2.tagElement = 32 AND t2.value >= '19700101' INNER JOIN Metadata m ON m.id = studies.internalId AND m.type = 7 AND m.value >= '20240415T120000' WHERE studies.resourceType = 1 -- AND studies.internalId IN (SELECT id FROM (SELECT id, COUNT(1) AS labelsCount FROM labels WHERE label IN ('Label2', 'Label6') GROUP BY id) AS temp WHERE labelsCount = 2) -- All labels -- AND studies.internalId IN (SELECT id FROM (SELECT id, COUNT(1) AS labelsCount FROM labels WHERE label IN ('Label2', 'Label6') GROUP BY id) AS temp WHERE labelsCount >= 1) -- Any labels AND studies.internalId NOT IN (SELECT id FROM (SELECT id, COUNT(1) AS labelsCount FROM labels WHERE label IN ('Label2', 'Label6') GROUP BY id) AS temp WHERE labelsCount >= 1) -- None of labels ORDER BY t2.value DESC, m.value ASC LIMIT 100 OFFSET 0 We may store this in a temporary table that we can reuse e.g to retrieve, the labels, the metadata, the MainDicomTags ... CREATE TEMPORARY TABLE FilteredResourcesIds AS SELECT series.publicId, series.internalId FROM Resources AS series INNER JOIN Resources studies ON studies.internalId=series.parentId INNER JOIN DicomIdentifiers t0 ON t0.id = studies.internalId AND t0.tagGroup = 16 AND t0.tagElement = 32 -- Patient tags are copied at study level ! INNER JOIN DicomIdentifiers t1 ON t1.id = studies.internalId AND t1.tagGroup = 8 AND t1.tagElement = 4144 INNER JOIN DicomIdentifiers t2 ON t2.id = studies.internalId AND t2.tagGroup = 8 AND t2.tagElement = 32 WHERE series.resourceType = 2 AND t0.value = 'ID-DYL-WAL-00000008' AND t1.value like '%000000%' ESCAPE '\' ORDER BY t2 DESC LIMIT 101; Note: we can probably only retrieve the internalId as the key -> since it is an integer, it is more efficient than a uuid to populate the responses Retrieve all MainDicomTags at the series level SELECT internalId, publicId, tagGroup, tagElement, value FROM MainDicomTags AS tags INNER JOIN FilteredResourcesIds ON tags.id = FilteredResourcesIds.internalId; Retrieve all MainDicomTags at the study level too but attach them to the series ids SELECT series.internalId, series.publicId, tagGroup, tagElement, value FROM Resources as series INNER JOIN Resources studies ON studies.internalId=series.parentId INNER JOIN FilteredResourcesIds filtered ON filtered.internalId = series.internalId INNER JOIN MainDicomTags tags ON tags.id=studies.internalId Retrieve all Metadata from all the series SELECT metadata.id, series.publicId, metadata.type, metadata.value, metadata.revision FROM Metadata INNER JOIN FilteredResourcesIds filtered ON filtered.internalId = Metadata.id Retrieve all Labels from all the series (not tested yet) SELECT labels.id, series.publicId, label FROM Labels INNER JOIN FilteredResourcesIds filtered ON filtered.internalId = Labels.id Retrieve all parents from all the series SELECT filtered.internalId, filtered.publicId, parentLevel.internalId as parentInternalId, parentLevel.publicId as parentPublicId FROM Resources as currentLevel INNER JOIN FilteredResourcesIds filtered ON filtered.internalId = currentLevel.internalId INNER JOIN Resources parentLevel ON currentLevel.parentId = parentLevel.internalId Retrieve all children from all the series SELECT filtered.internalId, filtered.publicId, childLevel.internalId as childInternalId, childLevel.publicId as childPublicId FROM Resources as currentLevel INNER JOIN FilteredResourcesIds filtered ON filtered.internalId = currentLevel.internalId INNER JOIN Resources childLevel ON childLevel.parentId = currentLevel.internalId Retrieve one instanceId by series SELECT filtered.internalId, filtered.publicId, MIN(c0.publicId) as instancePublicId FROM Resources as currentLevel INNER JOIN FilteredResourcesIds filtered ON filtered.internalId = currentLevel.internalId INNER JOIN Resources c0 ON c0.parentId = currentLevel.internalId GROUP BY filtered.internalId, filtered.publicId TODO Retrieve all attachments from all the series