Source code for ChildProject.pipelines.anonymize

import glob
import os
import pandas as pd
import json
import re
import shutil

from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager
from ChildProject.pipelines.pipeline import Pipeline


class AnonymizationPipeline(Pipeline):
    """Anonymize a set of its annotations (`input_set`) and saves it as `output_set`.

    Every file found under ``annotations/<input_set>/raw`` is copied line by
    line to ``annotations/<output_set>/raw``; on the way, attribute values of
    selected nodes are overwritten according to a *replacements* mapping.

    The replacements mapping has the shape
    ``{node_name: {attribute_name: replacement}}`` where ``replacement`` is
    either:

    - a scalar: the whole attribute value is replaced with it, or
    - a list of option dicts (e.g.
      ``[{"replace_value": "1000-01-01"}, {"only_time": "true"}]``): the
      attribute is replaced with ``replace_value``; when ``only_time`` is
      enabled, only the leading run of digits and dashes of the value is
      replaced and whatever follows it is kept.
    """

    DEFAULT_REPLACEMENTS = {
        "PrimaryChild": {"DOB": "1000-01-01"},
        "ITS": {
            "fileName": "new_filename_1001",
            "timeCreated": [{"replace_value": "1000-01-01"}, {"only_time": "true"}],
        },
        "TransferTime": {
            "UTCTime": [{"replace_value": "1000-01-01"}, {"only_time": "true"}],
            "LocalTime": [{"replace_value": "1000-01-01"}, {"only_time": "true"}],
        },
        "ChildInfo": {"dob": "1000-01-01"},
        "Child": {"DOB": "1000-01-01", "EnrollDate": "1000-01-01", "id": "A999"},
        "Bar": {
            "startClockTime": [{"replace_value": "1000-01-01"}, {"only_time": "true"}]
        },
        "BarSummary": {
            "leftBoundaryClockTime": [
                {"replace_value": "1000-01-01"},
                {"only_time": "true"},
            ],
            "rightBoundaryClockTime": [
                {"replace_value": "1000-01-01"},
                {"only_time": "true"},
            ],
        },
        "Recording": {
            "startClockTime": [{"replace_value": "1000-01-01"}, {"only_time": "true"}],
            "endClockTime": [{"replace_value": "1000-01-01"}, {"only_time": "true"}],
        },
        "FiveMinuteSection": {
            "startClockTime": [{"replace_value": "1000-01-01"}, {"only_time": "true"}],
            "endClockTime": [{"replace_value": "1000-01-01"}, {"only_time": "true"}],
        },
        "Item": {"timeStamp": [{"replace_value": "1000-01-01"}, {"only_time": "true"}]},
        "ProcessingJob": {
            "logfile": "exec10001010T100010Z_job00000001-10001010_101010_100100.upl.log"
        },
        "ResourceSnapshot": {
            "timelocal": [{"replace_value": "1000-01-01"}, {"only_time": "true"}],
            "timegmt": [{"replace_value": "1000-01-01"}, {"only_time": "true"}],
        },
    }

    # String spellings that disable a flag. Any other non-empty string keeps
    # enabling it, so existing configurations behave as before.
    _FALSE_FLAGS = frozenset({"", "false", "0", "no", "off"})

    def run(
        self,
        path: str,
        input_set: str,
        output_set: str,
        replacements_json_dict: str = "",
        **kwargs
    ):
        """Anonymize a set of its annotations (`input_set`) and saves it as `output_set`.

        :param path: path to the project
        :param input_set: name of the annotation set to read from
        :param output_set: name of the annotation set to write to; its ``raw``
            directory must not exist yet
        :param replacements_json_dict: path to a JSON file holding the
            replacements mapping; when empty, ``DEFAULT_REPLACEMENTS`` is used
        :raises Exception: if `input_set` equals `output_set`, or if the
            destination directory already exists
        :raises ValueError: if a list-valued replacement has no
            ``replace_value`` entry
        """
        if input_set == output_set:
            raise Exception("input_set and output_set should not be equal")

        project = ChildProject(path)
        project.read()

        replacements = self.DEFAULT_REPLACEMENTS
        if replacements_json_dict:
            # Context manager so the configuration file handle is released.
            with open(replacements_json_dict, "r") as config:
                replacements = json.load(config)

        input_set_path = os.path.join(project.path, "annotations", input_set, "raw")
        output_set_path = os.path.join(project.path, "annotations", output_set, "raw")

        if os.path.exists(output_set_path):
            raise Exception("destination {} already exists".format(output_set_path))

        # Compile once per run instead of once per line and per node; this
        # also rejects a malformed configuration before anything is written.
        rules = self._compile_rules(replacements)

        its_files = glob.glob(os.path.join(input_set_path, "**/*.*"), recursive=True)
        for in_file in its_files:
            # The pattern also matches directories whose name contains a dot.
            if not os.path.isfile(in_file):
                continue

            out_file = os.path.join(
                output_set_path, os.path.relpath(in_file, input_set_path)
            )
            os.makedirs(os.path.dirname(out_file), exist_ok=True)

            with open(in_file, "r") as source, open(out_file, "w") as target:
                for line in source:
                    target.write(self._anonymize_line(line, rules))

    @classmethod
    def _flag_enabled(cls, flag):
        """Interpret an option flag given either as a bool or as a string."""
        if isinstance(flag, str):
            return flag.strip().lower() not in cls._FALSE_FLAGS
        return bool(flag)

    @classmethod
    def _compile_rules(cls, replacements):
        """Turn a replacements mapping into ``[(node_regex, [(attr_regex, text)])]``."""
        rules = []
        for node, attributes in replacements.items():
            substitutions = []
            for name, value in attributes.items():
                only_date = False
                if isinstance(value, list):
                    # Merge the option dicts so their order does not matter.
                    options = {}
                    for entry in value:
                        options.update(entry)
                    if "replace_value" not in options:
                        raise ValueError(
                            "replacement for {}.{} has no 'replace_value'".format(
                                node, name
                            )
                        )
                    value = options["replace_value"]
                    only_date = cls._flag_enabled(options.get("only_time", False))

                if only_date:
                    # Replace the leading digits/dashes only; the remainder of
                    # the value and the closing quote are left in place.
                    pattern = r'{}="[0-9\-]*'.format(name)
                    text = '{}="{}'.format(name, value)
                else:
                    pattern = r'{}="[a-zA-Z0-9_.:\-]*"'.format(name)
                    text = '{}="{}"'.format(name, value)

                substitutions.append((re.compile(pattern), text))

            # word boundary is important here
            rules.append((re.compile(r"<{}\b".format(node)), substitutions))
        return rules

    @staticmethod
    def _anonymize_line(line, rules):
        """Apply, in order, the substitutions of every rule whose node occurs in `line`."""
        for node_pattern, substitutions in rules:
            if node_pattern.search(line):
                for pattern, text in substitutions:
                    # A callable replacement inserts the text literally, so a
                    # backslash in it is not read as a regex escape.
                    line = pattern.sub(lambda _match, text=text: text, line)
        return line

    @staticmethod
    def setup_parser(parser):
        """Register the command-line arguments of this pipeline on `parser`."""
        parser.add_argument("path", help="project path")
        parser.add_argument("--input-set", help="input annotation set", required=True)
        parser.add_argument("--output-set", help="output annotation set", required=True)
        parser.add_argument(
            "--replacements-json-dict",
            help="path to the replacements configuration (json dict)",
            required=False,
            default="",
        )