# Source code for ChildProject.pipelines.metrics

from abc import ABC, abstractmethod
import os
import argparse
import datetime
import multiprocessing as mp
import numpy as np
import pandas as pd
from typing import Union, List
import yaml
from git import Repo
from git.exc import InvalidGitRepositoryError

import ChildProject
from ChildProject.pipelines.pipeline import Pipeline

from ChildProject.tables import assert_dataframe, assert_columns_presence, read_csv_with_dtype
import ChildProject.pipelines.metricsFunctions as metfunc
from ..utils import TimeInterval, time_intervals_intersect

# Registry of the available metrics pipelines: maps a subclass' SUBCOMMAND
# string to the subclass itself (filled in by Metrics.__init_subclass__).
pipelines = {}

class Metrics(ABC):
    """
    Main class for generating metrics from a project object and a list of desired metrics

    :param project: ChildProject instance of the target dataset.
    :type project: ChildProject.projects.ChildProject
    :param metrics_list: pandas DataFrame containing the desired metrics (metrics functions are in ChildProject/pipelines/metricsFunctions.py). Must contain at least the columns 'callable' and 'set'. The DataFrame passed by the caller is not modified.
    :type metrics_list: pd.DataFrame
    :param by: unit to extract metric from (recording_filename, experiment, child_id, session_id, segments), defaults to 'recording_filename', 'segments' is mandatory if passing the segments argument
    :type by: str, optional
    :param recordings: recordings to sample from; if None, all recordings will be sampled, defaults to None
    :type recordings: Union[str, List[str], pd.DataFrame], optional
    :param from_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None
    :type from_time: str, optional
    :param to_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None
    :type to_time: str, optional
    :param rec_cols: comma separated columns from recordings.csv to include in the outputted metrics (optional), recording_filename,session_id,child_id,duration are always included if possible and dont need to be specified. Any column that is not unique for a given unit (eg date_iso for a child_id being recorded on multiple days) will output a <NA> value
    :type rec_cols: str, optional
    :param child_cols: comma separated columns from children.csv to include in the outputted metrics (optional), None by default
    :type child_cols: str, optional
    :param period: time units to aggregate (optional); equivalent to ``pandas.Grouper`` freq argument.
    :type period: str, optional
    :param segments: DataFrame or path to csv file of the segments to extract from, containing 'recording_filename', 'segment_onset' and 'segment_offset' columns. To use this option, the ``by`` option must be set to 'segments'. Also, this option cannot be combined with options [recordings,period,from_time,to_time].
    :type segments: Union[str, pd.DataFrame], optional
    :param threads: amount of threads to run on, defaults to 1
    :type threads: int, optional
    :raises ValueError: if metrics_list is malformed, references an unknown callable or annotation set, if ``by`` is not a column of the recordings, or if from_time/to_time cannot be parsed
    """

    def __init__(
        self,
        project: ChildProject.projects.ChildProject,
        metrics_list: pd.DataFrame,
        by: str = "recording_filename",
        recordings: Union[str, List[str], pd.DataFrame] = None,
        from_time: str = None,
        to_time: str = None,
        rec_cols: str = None,
        child_cols: str = None,
        period: str = None,
        segments: Union[str, pd.DataFrame] = None,
        threads: int = 1,
    ):
        self.project = project
        self.am = ChildProject.annotations.AnnotationManager(self.project)
        self.threads = int(threads)

        # check that the callable column is either a callable function or a string
        # that can be found as being part of the list of metrics in
        # ChildProject/pipelines/metricsFunctions.py
        def check_callable(row):
            if callable(row["callable"]):
                return row["callable"]
            if isinstance(row["callable"], str):
                try:
                    return getattr(metfunc, row["callable"])
                except AttributeError as e:
                    raise ValueError("{} function is not defined and was not found in ChildProject/pipelines/metricsFunctions.py".format(row["callable"])) from e
            raise ValueError("{} cannot be evaluated as a metric, must be a callable object or a string".format(row["callable"]))

        # block checking presence of required columns and evaluates the callable functions
        if not isinstance(metrics_list, pd.DataFrame):
            raise ValueError("metrics_list parameter must be a pandas DataFrame")
        if not {'callable', 'set'}.issubset(metrics_list.columns):
            raise ValueError("metrics_list parameter must contain atleast the columns [callable,set]")

        # work on a private copy so that the DataFrame handed in by the caller is left untouched
        metrics_list = metrics_list.copy()
        metrics_list["callable"] = metrics_list.apply(check_callable, axis=1)
        # rows are grouped by set so that _process_unit retrieves each set only once
        metrics_list = metrics_list.sort_values(by="set")

        for setname in np.unique(metrics_list['set'].values):
            if setname not in self.am.annotations["set"].values:
                raise ValueError(
                    f"annotation set '{setname}' was not found in the index; "
                    "check spelling and make sure the set was properly imported."
                )
        self.metrics_list = metrics_list

        # necessary columns to construct the metrics
        join_columns = {
            "recording_filename",
            "child_id",
            "duration",
            "session_id",
            "session_offset",
        }
        # get existing columns of the dataset for recordings
        correct_cols = set(self.project.recordings.columns)

        if by != 'segments' and by not in correct_cols:
            raise ValueError("<{}> is not specified in this dataset, cannot extract by it, change your --by option".format(by))

        if rec_cols:
            # when user requests recording columns, build the list and verify they exist (warn otherwise)
            rec_cols = set(rec_cols.split(","))
            for i in rec_cols:
                if i not in correct_cols:
                    print("Warning, requested column <{}> does not exist in recordings.csv, ignoring this column. existing columns are : {}".format(i, correct_cols))
            rec_cols &= correct_cols
            # add wanted columns to the one we already get
            join_columns.update(rec_cols)
        # always defined (possibly None/empty) because _initiate_metrics_df reads it
        self.rec_cols = rec_cols

        join_columns &= correct_cols

        # join dataset annotation with their info in recordings.csv
        self.am.annotations = self.am.annotations.merge(
            self.project.recordings[list(join_columns)],
            left_on="recording_filename",
            right_on="recording_filename",
        )

        # get existing columns of the dataset for children
        correct_cols = set(self.project.children.columns)

        if child_cols:
            # when user requests children columns, build the list and verify they exist (warn otherwise)
            child_cols = set(child_cols.split(","))
            child_cols.add("child_id")
            for i in child_cols:
                if i not in correct_cols:
                    print("Warning, requested column <{}> does not exist in children.csv, ignoring this column. existing columns are : {}".format(i, correct_cols))
            child_cols &= correct_cols
            self.child_cols = child_cols

            # join dataset annotation with their info in children.csv
            self.am.annotations = self.am.annotations.merge(
                self.project.children[list(child_cols)],
                left_on="child_id",
                right_on="child_id",
            )
        else:
            self.child_cols = None

        self.by = by
        self.period = period
        self.segments = segments
        self.recordings = Pipeline.recordings_from_list(recordings)

        # If the extraction is done on segments
        if segments is not None:
            # we enforce that incompatible arguments are not set
            # NOTE: kept as an assert so callers catching AssertionError keep working
            assert by == 'segments' and period is None and recordings is None and from_time is None and to_time is None, "the <segments> option can not be combined with options [period,recordings,from_time,to_time], and <by> should be set to 'segments'"

            dtypes = {'recording_filename': 'string', 'segment_onset': 'Int64', 'segment_offset': 'Int64'}

            # use the DataFrame provided or import it from a csv file
            if isinstance(segments, pd.DataFrame):
                self.segments = segments.astype(dtypes)
            else:
                self.segments = read_csv_with_dtype(segments, dtypes)

            # check that required columns are present and dataframe not empty
            assert_dataframe("segments", self.segments, not_empty=True)
            assert_columns_presence(
                "segments",
                self.segments,
                {"recording_filename", "segment_onset", "segment_offset"},
            )
        # not on segments
        elif self.period:
            # build a dataframe with all the periods we will want for each unit
            self.periods = pd.interval_range(
                start=datetime.datetime(1900, 1, 1, 0, 0, 0, 0),
                end=datetime.datetime(1900, 1, 2, 0, 0, 0, 0),
                freq=self.period,
                closed="both",
            )
            self.periods = pd.DataFrame(self.periods.to_tuples().to_list(), columns=['period_start', 'period_end'])

        # turn from_time and to_time to datetime objects (None when not provided,
        # which is always the case when extracting from segments)
        self.from_time = self._parse_time(from_time, "from_time")
        self.to_time = self._parse_time(to_time, "to_time")

        self._initiate_metrics_df()

    @staticmethod
    def _parse_time(value, name):
        """parse a HH:MM:SS string into a datetime object, None if no value is given

        :param value: time of day in HH:MM:SS format, or a falsy value
        :type value: str
        :param name: name of the option being parsed, used in the error message
        :type name: str
        :return: parsed time, or None if value is falsy
        :rtype: datetime.datetime
        :raises ValueError: if value does not follow the HH:MM:SS format
        """
        if not value:
            return None
        try:
            return datetime.datetime.strptime(value, "%H:%M:%S")
        except (TypeError, ValueError) as e:
            raise ValueError(f"invalid value for {name} ('{value}'); should have HH:MM:SS format instead") from e

    def __init_subclass__(cls, **kwargs):
        # every subclass registers itself in the module level registry under its SUBCOMMAND
        super().__init_subclass__(**kwargs)
        pipelines[cls.SUBCOMMAND] = cls

    def _process_unit(self, row):
        """for one unit (i.e. 1 {recording|session|child} [period]) compute the list of required metrics and store the results in the current row of self.metrics

        :param row: index and Series of the unit to process, to be modified with the results
        :type row: (int , pandas.Series)
        :return: Series containing all the computed metrics result for that unit
        :rtype: pandas.Series
        """
        # row[0] is the index of the row we are processing
        # row[1] is the actual Series containing all the metrics for the currently processed line
        prev_set = ""
        duration_set = 0
        for i, line in self.metrics_list.iterrows():
            curr_set = line["set"]
            # metrics_list is sorted by set, so annotations are only fetched when the set changes
            if prev_set != curr_set:
                index, annotations = self.retrieve_segments([curr_set], row[1])
                if index.shape[0]:
                    duration_set = (
                        index["range_offset"] - index["range_onset"]
                    ).sum()
                else:
                    duration_set = 0
                row[1]["duration_{}".format(line["set"])] = duration_set
                prev_set = curr_set

            # every remaining non-null column of the line is passed as a keyword argument to the metric
            name, value = line["callable"](annotations, duration_set, **line.drop(['callable', 'set']).dropna().to_dict())
            row[1][name] = value

        return row[1]

    def extract(self):
        """from the initiated self.metrics, compute each row metrics (handles threading)
        Once the Metrics class is initialized, call this function to extract the metrics and populate self.metrics

        :return: DataFrame of computed metrics
        :rtype: pandas.DataFrame
        """
        if self.threads == 1:
            self.metrics = pd.DataFrame(
                [self._process_unit(row) for row in self.metrics.iterrows()]
            )
        else:
            # a thread count below 1 means "use every available core"
            with mp.Pool(
                processes=self.threads if self.threads >= 1 else mp.cpu_count()
            ) as pool:
                self.metrics = pd.DataFrame(
                    pool.map(self._process_unit, self.metrics.iterrows())
                )

        if self.period:
            self.metrics['period_start'] = self.metrics['period_start'].dt.strftime('%H:%M:%S')
            self.metrics['period_end'] = self.metrics['period_end'].dt.strftime('%H:%M:%S')

        return self.metrics

    def retrieve_segments(self, sets: List[str], row: pd.Series):
        """from a list of sets and a row identifying the unit computed, return the relevant annotation segments

        :param sets: List of annotation sets to keep
        :type sets: List[str]
        :param row: Series storing the unit to compute information
        :type row: pandas.Series
        :return: index DataFrame (matching annotation index entries) and DataFrame of the corresponding segments; two empty DataFrames if nothing matches
        :rtype: (pandas.DataFrame , pandas.DataFrame)
        """
        # if extraction from segments, annotations are retrieved from get_within_ranges
        if self.segments is not None:
            matches = self.am.get_within_ranges(
                pd.DataFrame(
                    [[row['recording_filename'], row['segment_onset'], row['segment_offset']]],
                    columns=['recording_filename', 'range_onset', 'range_offset'],
                ),
                sets=sets,
                missing_data='warn',
            )
        # else prepare and use get_within_time_range
        else:
            annotations = self.am.annotations[self.am.annotations[self.by] == row[self.by]]
            annotations = annotations[annotations["set"].isin(sets)]

            # restrict to time ranges
            # NOTE: the time range is only applied when both from_time and to_time are set
            if self.from_time and self.to_time:
                if self.period:
                    # keep only what is both inside the requested range and inside the period
                    st_hour = row["period_start"]
                    end_hour = row["period_end"]
                    intervals = time_intervals_intersect(TimeInterval(self.from_time, self.to_time), TimeInterval(st_hour, end_hour))
                    matches = pd.concat([self.am.get_within_time_range(annotations, i) for i in intervals], ignore_index=True) if intervals else pd.DataFrame()
                else:
                    matches = self.am.get_within_time_range(
                        annotations, TimeInterval(self.from_time, self.to_time))
            elif self.period:
                st_hour = row["period_start"]
                end_hour = row["period_end"]
                matches = self.am.get_within_time_range(
                    annotations, TimeInterval(st_hour, end_hour))
            else:
                matches = annotations

        if not matches.shape[0]:
            # no annotations for that unit
            return pd.DataFrame(), pd.DataFrame()

        segments = self.am.get_segments(matches)

        # prevent overflows
        segments["duration"] = (
            (segments["segment_offset"] - segments["segment_onset"])
            .astype(float)
            .fillna(0)
        )

        return matches, segments

    def _initiate_metrics_df(self):
        """builds a dataframe with all the rows necessary and their labels
        eg : - one row per child if --by child_id and no --period
             - 48 rows if 2 recordings in the corpus --period 1h --by recording_filename
        Then the extract() method should populate the dataframe with actual metrics

        :raises ValueError: if two metrics of metrics_list produce the same name
        """
        # build the metrics dataframe from the segments argument
        if self.segments is not None:
            recordings = self.project.get_recordings_from_list(self.segments['recording_filename'].unique())
            # from here on, units are looked up through their recording
            self.by = 'recording_filename'
            self.metrics = self.segments.copy()
        # else use the list of recordings of the dataset and the by option
        else:
            recordings = self.project.get_recordings_from_list(self.recordings)
            self.metrics = pd.DataFrame(recordings[self.by].unique(), columns=[self.by])
            if self.period:
                # if period, use the self.periods dataframe to build all the list of segments per unit
                # with old versions of pandas, we are forced to have a common column to do a cross join, we drop the column after
                self.metrics["key"] = 0
                self.periods["key"] = 0
                self.metrics = pd.merge(self.metrics, self.periods, on='key', how='outer').drop('key', axis=1)

        # add info for child_id
        self.metrics["child_id"] = self.metrics.apply(
            lambda row: self.project.recordings[
                self.project.recordings[self.by] == row[self.by]
            ]["child_id"].iloc[0],
            axis=1,
        )

        # get and add to dataframe children.csv columns asked
        if self.child_cols:
            for label in self.child_cols:
                self.metrics[label] = self.metrics.apply(
                    lambda row: self.project.children[
                        self.project.children["child_id"] == row["child_id"]
                    ][label].iloc[0],
                    axis=1,
                )

        # this loop is for the purpose of checking for name duplicates in the metrics
        # we do a dry run on the first line with no annotations bc impractical to check in multiprocessing
        df = pd.DataFrame()
        duration_set = 0
        names = set()
        for i, line in self.metrics_list.iterrows():
            name, value = line["callable"](df, duration_set, **line.drop(['callable', 'set'], errors='ignore').dropna().to_dict())

            if name in names:
                raise ValueError('the metric name <{}> is used multiple times, make sure it is unique'.format(name))
            names.add(name)

        # checking that columns added by the user are unique (e.g. date_iso may be different when extract by child_id), replace with NA if they are not
        def check_unicity(row, label):
            value = self.project.recordings[
                self.project.recordings[self.by] == row[self.by]
            ][label].drop_duplicates()
            # check that there is only one row remaining (ie this column has a unique value for that unit)
            if len(value) == 1:
                return value.iloc[0]
            # otherwise, leave the column as NA
            return np.nan

        # get and add to dataframe recordings.csv columns asked
        if self.rec_cols:
            for label in self.rec_cols:
                self.metrics[label] = self.metrics.apply(lambda row: check_unicity(row, label), axis=1)
class CustomMetrics(Metrics):
    """metrics extraction from a csv file.
    Extracts a number of metrics listed in a csv file as a dataframe.
    the csv file must contain the columns :
    - 'callable' which is the name of the wanted metric from the list of available metrics
    - 'set' which is the set of annotations to use for that specific metric (make sure this set has the required columns for that metric)
    - 'name' is optional, this is the name to give to that metric (if not given, a default name will be attributed)
    - any other necessary argument for the given metrics (eg the voc_speaker_ph metric requires the 'speaker' argument: add a column 'speaker' in the csv file and fill its cells for this metric with the wanted value (CHI|FEM|MAL|OCH))

    :param project: ChildProject instance of the target dataset.
    :type project: ChildProject.projects.ChildProject
    :param metrics: name of the csv file listing the metrics to extract
    :type metrics: str
    :param recordings: recordings to sample from; if None, all recordings will be sampled, defaults to None
    :type recordings: Union[str, List[str], pd.DataFrame], optional
    :param from_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None
    :type from_time: str, optional
    :param to_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None
    :type to_time: str, optional
    :param rec_cols: comma separated columns from recordings.csv to include in the outputted metrics (optional), recording_filename,session_id,child_id,duration are always included if possible and dont need to be specified. Any column that is not unique for a given unit (eg date_iso for a child_id being recorded on multiple days) will output a <NA> value
    :type rec_cols: str, optional
    :param child_cols: comma separated columns from children.csv to include in the outputted metrics (optional), None by default
    :type child_cols: str, optional
    :param by: unit to extract metric from (recording_filename, experiment, child_id, session_id, segments), defaults to 'recording_filename', 'segments' is mandatory if passing the segments argument
    :type by: str, optional
    :param period: time units to aggregate (optional); equivalent to ``pandas.Grouper`` freq argument.
    :type period: str, optional
    :param segments: DataFrame or path to csv file of the segments to extract from, containing 'recording_filename', 'segment_onset' and 'segment_offset' columns. To use this option, the ``by`` option must be set to 'segments'. Also, this option cannot be combined with options [recordings,period,from_time,to_time].
    :type segments: Union[str, pd.DataFrame], optional
    :param threads: amount of threads to run on, defaults to 1
    :type threads: int, optional
    """

    SUBCOMMAND = "custom"

    def __init__(
        self,
        project: ChildProject.projects.ChildProject,
        metrics: str,
        recordings: Union[str, List[str], pd.DataFrame] = None,
        from_time: str = None,
        to_time: str = None,
        rec_cols: str = None,
        child_cols: str = None,
        by: str = "recording_filename",
        period: str = None,
        segments: Union[str, pd.DataFrame] = None,
        threads: int = 1,
    ):
        # every option except the csv path is forwarded untouched to Metrics
        forwarded = dict(
            by=by,
            recordings=recordings,
            from_time=from_time,
            to_time=to_time,
            rec_cols=rec_cols,
            child_cols=child_cols,
            period=period,
            segments=segments,
            threads=threads,
        )
        # the csv file is the list of metrics itself
        super().__init__(project, pd.read_csv(metrics), **forwarded)

    @staticmethod
    def add_parser(subparsers, subcommand):
        """register the command line sub-parser of this pipeline

        :param subparsers: object returned by ``add_subparsers`` to attach the parser to
        :param subcommand: name of the sub-command
        :type subcommand: str
        """
        custom_parser = subparsers.add_parser(subcommand, help="metrics from a csv file")
        custom_parser.add_argument(
            "metrics",
            help="name if the csv file containing the list of metrics",
        )
class LenaMetrics(Metrics):
    """LENA metrics extractor.
    Extracts a number of metrics from the LENA .its annotations.

    :param project: ChildProject instance of the target dataset.
    :type project: ChildProject.projects.ChildProject
    :param set: name of the set associated to the .its annotations
    :type set: str
    :param recordings: recordings to sample from; if None, all recordings will be sampled, defaults to None
    :type recordings: Union[str, List[str], pd.DataFrame], optional
    :param from_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None
    :type from_time: str, optional
    :param to_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None
    :type to_time: str, optional
    :param rec_cols: comma separated columns from recordings.csv to include in the outputted metrics (optional), recording_filename,session_id,child_id,duration are always included if possible and dont need to be specified. Any column that is not unique for a given unit (eg date_iso for a child_id being recorded on multiple days) will output a <NA> value
    :type rec_cols: str, optional
    :param child_cols: comma separated columns from children.csv to include in the outputted metrics (optional), None by default
    :type child_cols: str, optional
    :param by: unit to extract metric from (recording_filename, experiment, child_id, session_id, segments), defaults to 'recording_filename', 'segments' is mandatory if passing the segments argument
    :type by: str, optional
    :param period: time units to aggregate (optional); equivalent to ``pandas.Grouper`` freq argument.
    :type period: str, optional
    :param segments: DataFrame or path to csv file of the segments to extract from, containing 'recording_filename', 'segment_onset' and 'segment_offset' columns. To use this option, the ``by`` option must be set to 'segments'. Also, this option cannot be combined with options [recordings,period,from_time,to_time].
    :type segments: Union[str, pd.DataFrame], optional
    :param threads: amount of threads to run on, defaults to 1
    :type threads: int, optional
    :raises ValueError: if the set is not found in the annotation index
    """

    SUBCOMMAND = "lena"

    def __init__(
        self,
        project: ChildProject.projects.ChildProject,
        set: str,
        recordings: Union[str, List[str], pd.DataFrame] = None,
        from_time: str = None,
        to_time: str = None,
        rec_cols: str = None,
        child_cols: str = None,
        by: str = "recording_filename",
        period: str = None,
        segments: Union[str, pd.DataFrame] = None,
        threads: int = 1,
    ):
        self.set = set

        # one row per metric: [callable name, annotation set, speaker argument (NA if not applicable)]
        METRICS = pd.DataFrame(np.array(
            [["voc_speaker_ph", self.set, 'FEM'],
             ["voc_speaker_ph", self.set, 'MAL'],
             ["voc_speaker_ph", self.set, 'OCH'],
             ["voc_speaker_ph", self.set, 'CHI'],
             ["voc_dur_speaker_ph", self.set, 'FEM'],
             ["voc_dur_speaker_ph", self.set, 'MAL'],
             ["voc_dur_speaker_ph", self.set, 'OCH'],
             ["voc_dur_speaker_ph", self.set, 'CHI'],
             ["avg_voc_dur_speaker", self.set, 'FEM'],
             ["avg_voc_dur_speaker", self.set, 'MAL'],
             ["avg_voc_dur_speaker", self.set, 'OCH'],
             ["avg_voc_dur_speaker", self.set, 'CHI'],
             ["wc_speaker_ph", self.set, 'FEM'],
             ["wc_speaker_ph", self.set, 'MAL'],
             ["wc_adu_ph", self.set, pd.NA],
             ["lp_n", self.set, pd.NA],
             ["lp_dur", self.set, pd.NA],
             ["lena_CVC", self.set, pd.NA],
             ["lena_CTC", self.set, pd.NA],
             ]), columns=["callable", "set", "speaker"])

        super().__init__(project, METRICS, by=by, recordings=recordings,
                         period=period, from_time=from_time, to_time=to_time, rec_cols=rec_cols,
                         child_cols=child_cols, segments=segments, threads=threads)

        # self.am is the annotation manager created by Metrics.__init__
        if self.set not in self.am.annotations["set"].values:
            raise ValueError(
                f"annotation set '{self.set}' was not found in the index; "
                "check spelling and make sure the set was properly imported."
            )

    @staticmethod
    def add_parser(subparsers, subcommand):
        """register the command line sub-parser of this pipeline

        :param subparsers: object returned by ``add_subparsers`` to attach the parser to
        :param subcommand: name of the sub-command
        :type subcommand: str
        """
        parser = subparsers.add_parser(subcommand, help="LENA metrics")
        parser.add_argument("set", help="name of the LENA its annotations set")
class AclewMetrics(Metrics):
    """ACLEW metrics extractor.
    Extracts a number of metrics from the ACLEW pipeline annotations, which includes:

     - The Voice Type Classifier by Lavechin et al. (arXiv:2005.12656)
     - The Automatic LInguistic Unit Count Estimator (ALICE) by Räsänen et al. (doi:10.3758/s13428-020-01460-x)
     - The VoCalisation Maturity model (VCMNet) by Al Futaisi et al. (doi:10.1145/3340555.3353751)

    The ALICE and VCM metrics are only extracted when their set exists in the
    annotation index; a message is printed otherwise.

    :param project: ChildProject instance of the target dataset.
    :type project: ChildProject.projects.ChildProject
    :param vtc: name of the set associated to the VTC annotations
    :type vtc: str
    :param alice: name of the set associated to the ALICE annotations
    :type alice: str
    :param vcm: name of the set associated to the VCM annotations
    :type vcm: str
    :param recordings: recordings to sample from; if None, all recordings will be sampled, defaults to None
    :type recordings: Union[str, List[str], pd.DataFrame], optional
    :param from_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None
    :type from_time: str, optional
    :param to_time: If specified (in HH:MM:SS format), ignore annotations outside of the given time-range, defaults to None
    :type to_time: str, optional
    :param rec_cols: comma separated columns from recordings.csv to include in the outputted metrics (optional), recording_filename,session_id,child_id,duration are always included if possible and dont need to be specified. Any column that is not unique for a given unit (eg date_iso for a child_id being recorded on multiple days) will output a <NA> value
    :type rec_cols: str, optional
    :param child_cols: comma separated columns from children.csv to include in the outputted metrics (optional), None by default
    :type child_cols: str, optional
    :param by: unit to extract metric from (recording_filename, experiment, child_id, session_id, segments), defaults to 'recording_filename', 'segments' is mandatory if passing the segments argument
    :type by: str, optional
    :param period: time units to aggregate (optional); equivalent to ``pandas.Grouper`` freq argument.
    :type period: str, optional
    :param segments: DataFrame or path to csv file of the segments to extract from, containing 'recording_filename', 'segment_onset' and 'segment_offset' columns. To use this option, the ``by`` option must be set to 'segments'. Also, this option cannot be combined with options [recordings,period,from_time,to_time].
    :type segments: Union[str, pd.DataFrame], optional
    :param threads: amount of threads to run on, defaults to 1
    :type threads: int, optional
    """

    SUBCOMMAND = "aclew"

    def __init__(
        self,
        project: ChildProject.projects.ChildProject,
        vtc: str = "vtc",
        alice: str = "alice",
        vcm: str = "vcm",
        recordings: Union[str, List[str], pd.DataFrame] = None,
        from_time: str = None,
        to_time: str = None,
        rec_cols: str = None,
        child_cols: str = None,
        period: str = None,
        segments: Union[str, pd.DataFrame] = None,
        by: str = "recording_filename",
        threads: int = 1,
    ):
        self.vtc = vtc
        self.alice = alice
        self.vcm = vcm

        # temporary instance to check for existing sets. This is suboptimal because an
        # annotation manager will be created by Metrics. However, the metrics class raises
        # a ValueError for every set passed that does not exist, here we want to check in
        # advance which of the alice and vcm sets exist without raising an error
        am = ChildProject.annotations.AnnotationManager(project)
        existing_sets = am.annotations["set"].values

        # one row per metric: [callable name, annotation set, speaker argument (NA if not applicable)]
        rows = [
            ["voc_speaker_ph", self.vtc, 'FEM'],
            ["voc_speaker_ph", self.vtc, 'MAL'],
            ["voc_speaker_ph", self.vtc, 'OCH'],
            ["voc_speaker_ph", self.vtc, 'CHI'],
            ["voc_dur_speaker_ph", self.vtc, 'FEM'],
            ["voc_dur_speaker_ph", self.vtc, 'MAL'],
            ["voc_dur_speaker_ph", self.vtc, 'OCH'],
            ["voc_dur_speaker_ph", self.vtc, 'CHI'],
            ["avg_voc_dur_speaker", self.vtc, 'FEM'],
            ["avg_voc_dur_speaker", self.vtc, 'MAL'],
            ["avg_voc_dur_speaker", self.vtc, 'OCH'],
            ["avg_voc_dur_speaker", self.vtc, 'CHI'],
            ["simple_CTC_ph", self.vtc, pd.NA],
        ]

        if self.alice not in existing_sets:
            print(f"The ALICE set ('{self.alice}') was not found in the index.")
        else:
            rows += [
                ["wc_speaker_ph", self.alice, 'FEM'],
                ["wc_speaker_ph", self.alice, 'MAL'],
                ["sc_speaker_ph", self.alice, 'FEM'],
                ["sc_speaker_ph", self.alice, 'MAL'],
                ["pc_speaker_ph", self.alice, 'FEM'],
                ["pc_speaker_ph", self.alice, 'MAL'],
                ["wc_adu_ph", self.alice, pd.NA],
                ["sc_adu_ph", self.alice, pd.NA],
                ["pc_adu_ph", self.alice, pd.NA],
            ]

        if self.vcm not in existing_sets:
            print(f"The vcm set ('{self.vcm}') was not found in the index.")
        else:
            rows += [
                ["cry_voc_speaker_ph", self.vcm, 'CHI'],
                ["cry_voc_dur_speaker_ph", self.vcm, 'CHI'],
                ["avg_cry_voc_dur_speaker", self.vcm, 'CHI'],
                ["can_voc_speaker_ph", self.vcm, 'CHI'],
                ["can_voc_dur_speaker_ph", self.vcm, 'CHI'],
                ["avg_can_voc_dur_speaker", self.vcm, 'CHI'],
                ["non_can_voc_speaker_ph", self.vcm, 'CHI'],
                ["non_can_voc_dur_speaker_ph", self.vcm, 'CHI'],
                ["avg_non_can_voc_dur_speaker", self.vcm, 'CHI'],
                ["lp_n", self.vcm, pd.NA],
                ["lp_dur", self.vcm, pd.NA],
                ["cp_n", self.vcm, pd.NA],
                ["cp_dur", self.vcm, pd.NA],
            ]

        # object dtype keeps the pd.NA placeholders next to the strings
        METRICS = pd.DataFrame(np.array(rows, dtype=object), columns=["callable", "set", "speaker"])

        super().__init__(project, METRICS, by=by, recordings=recordings,
                         period=period, from_time=from_time, to_time=to_time,
                         rec_cols=rec_cols, child_cols=child_cols, segments=segments, threads=threads)

    @staticmethod
    def add_parser(subparsers, subcommand):
        """register the command line sub-parser of this pipeline

        :param subparsers: object returned by ``add_subparsers`` to attach the parser to
        :param subcommand: name of the sub-command
        :type subcommand: str
        """
        # the help text used to read "LENA metrics", copied from LenaMetrics by mistake
        parser = subparsers.add_parser(subcommand, help="ACLEW metrics")
        parser.add_argument("--vtc", help="vtc set", default="vtc")
        parser.add_argument("--alice", help="alice set", default="alice")
        parser.add_argument("--vcm", help="vcm set", default="vcm")
class MetricsPipeline(Pipeline):
    """Command line pipeline extracting metrics with one of the registered Metrics subclasses.

    The metrics are written to a csv file and the parameters used are saved
    next to it in a timestamped yaml file.
    """

    def __init__(self):
        self.metrics = []

    def run(self, path, destination, pipeline, func=None, **kwargs):
        """extract the metrics of a dataset and export them along with the parameters used

        :param path: path to the dataset
        :type path: str
        :param destination: path of the csv file to write the metrics to
        :type destination: str
        :param pipeline: name of the metrics pipeline to use, must be a key of ``pipelines``
        :type pipeline: str
        :param func: unused
        :param kwargs: options forwarded to the constructor of the chosen Metrics subclass
        :return: DataFrame of computed metrics
        :rtype: pandas.DataFrame
        :raises NotImplementedError: if the pipeline name is not registered
        """
        self.destination = destination

        # build a dictionary with all parameters used (func is not a meaningful parameter to keep)
        parameters = {"path": path, "destination": destination, "pipeline": pipeline}
        parameters.update(kwargs)

        self.project = ChildProject.projects.ChildProject(path)

        # record the commit of the dataset when it is versioned with git
        try:
            datarepo = Repo(path)
            parameters['dataset_hash'] = datarepo.head.object.hexsha
        except InvalidGitRepositoryError:
            print("Your dataset is not currently a git repository")

        if pipeline not in pipelines:
            raise NotImplementedError(f"invalid pipeline '{pipeline}'")

        metrics = pipelines[pipeline](self.project, **kwargs)
        metrics.extract()

        self.metrics = metrics.metrics
        self.metrics.to_csv(self.destination, index=False)

        # get the df of metrics used from the Metrics class; work on a copy so that
        # the callables stored in the Metrics instance are not replaced by their names
        metrics_df = metrics.metrics_list.copy()
        metrics_df['callable'] = metrics_df.apply(lambda row: row['callable'].__name__, axis=1)  # from the callables used, find their name back
        parameters['metrics_list'] = [
            {k: v for k, v in m.items() if pd.notnull(v)}
            for m in metrics_df.to_dict(orient='records')
        ]
        date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

        # create a yaml file with all the parameters used
        self.parameters_path = os.path.splitext(self.destination)[0] + "_parameters_{}.yml".format(date)
        print("exported metrics to {}".format(self.destination))
        with open(self.parameters_path, "w+") as stream:
            yaml.dump(
                {
                    "package_version": ChildProject.__version__,
                    "date": date,
                    "parameters": parameters,
                },
                stream,
                sort_keys=False,
            )
        print("exported metrics parameters to {}".format(self.parameters_path))

        return self.metrics

    @staticmethod
    def setup_parser(parser):
        """declare the command line arguments of the pipeline, including one sub-command per registered Metrics subclass

        :param parser: parser to add the arguments to
        :type parser: argparse.ArgumentParser
        """
        parser.add_argument("path", help="path to the dataset")
        parser.add_argument("destination", help="segments destination")

        subparsers = parser.add_subparsers(help="pipeline", dest="pipeline")
        for pipeline in pipelines:
            pipelines[pipeline].add_parser(subparsers, pipeline)

        parser.add_argument(
            "--recordings",
            help="path to a CSV dataframe containing the list of recordings to sample from (by default, all recordings will be sampled). The CSV should have one column named recording_filename.",
            default=None,
        )

        parser.add_argument(
            "--by",
            help="units to sample from (default behavior is to sample by recording)",
            choices=["recording_filename", "session_id", "child_id", "experiment", "segments"],
            default="recording_filename",
        )

        parser.add_argument(
            "--segments",
            help="path to a CSV dataframe containing the list of segments to sample from. The CSV should have 3 columns named recording_filename, segment_onset, segment_offset. --by must be set to 'segments', Can not be used along with options [--period,--recordings,--from-tim,--to-time]",
            default=None,
        )

        parser.add_argument(
            "--period",
            help="time units to aggregate (optional); equivalent to ``pandas.Grouper`` freq argument. The resulting metrics will be split for each unit across all the resulting periods.",
            default=None,
        )

        parser.add_argument(
            "-f",
            "--from-time",
            help="time range start in HH:MM:SS format (optional)",
            default=None,
        )

        parser.add_argument(
            "-t",
            "--to-time",
            help="time range end in HH:MM:SS format (optional)",
            default=None,
        )

        parser.add_argument(
            "--rec-cols",
            help="comma separated columns from recordings.csv to include in the outputted metrics (optional), NA if ambiguous",
            default=None,
        )

        parser.add_argument(
            "--child-cols",
            help="comma separated columns from children.csv to include in the outputted metrics (optional), NA if ambiguous",
            default=None,
        )

        parser.add_argument(
            "--threads", help="amount of threads to run on", default=1, type=int
        )
class MetricsSpecificationPipeline(Pipeline):
    """Command line pipeline extracting metrics from a yaml parameters file.

    The file has the structure of the parameters file exported alongside the
    metrics: it must define the 'path', 'destination' and 'metrics_list' keys,
    either at the top level or under a 'parameters' key.
    """

    def __init__(self):
        self.metrics = []

    def run(self, parameters_input, func=None):
        """extract the metrics described in a yaml file and export them along with the parameters used

        :param parameters_input: path to the yml file with all parameters
        :type parameters_input: str
        :param func: unused
        :return: DataFrame of computed metrics
        :rtype: pandas.DataFrame
        :raises yaml.YAMLError: if the parameters file cannot be parsed
        :raises ValueError: if the file is empty, lacks a required key, or holds a parameter Metrics does not accept
        """
        # build a dictionary with all parameters used
        parameters = None
        with open(parameters_input, "r") as stream:
            try:
                parameters = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                raise yaml.YAMLError("parsing of the parameters file {} failed. See above exception for more details".format(parameters_input)) from exc

        # safe_load returns None for an empty file, which must lead to the ValueError below
        if parameters and 'parameters' in parameters:
            parameters = parameters['parameters']

        if not parameters:
            raise ValueError("could not find any parameters in {}".format(parameters_input))

        if "path" not in parameters:
            raise ValueError("the parameter file {} must contain at least the 'path' key specifying the path to the dataset".format(parameters_input))
        if "destination" not in parameters:
            raise ValueError("the parameter file {} must contain the 'destination' key specifying the file to output the metrics to".format(parameters_input))
        if "metrics_list" not in parameters:
            raise ValueError("the parameter file {} must contain the 'metrics_list' key containing the list of the desired metrics".format(parameters_input))
        try:
            metrics_df = pd.DataFrame(parameters["metrics_list"])
        except Exception as e:
            raise ValueError("The 'metrics_list' key in {} must be a list of elements".format(parameters_input)) from e

        # record the commit of the dataset when it is versioned with git
        try:
            datarepo = Repo(parameters["path"])
            parameters['dataset_hash'] = datarepo.head.object.hexsha
        except InvalidGitRepositoryError:
            print("Your dataset is not currently a git repository")

        self.project = ChildProject.projects.ChildProject(parameters["path"])
        self.destination = parameters['destination']

        # keys written by MetricsPipeline that Metrics does not accept
        unwanted_keys = {'metrics', 'pipeline'}
        for i in unwanted_keys:
            if i in parameters:
                del parameters[i]

        # everything that is not consumed here is forwarded to the Metrics constructor
        arguments = {
            key: parameters[key]
            for key in parameters
            if key not in {"metrics_list", "path", "destination", "dataset_hash"}
        }
        try:
            metrics = Metrics(self.project, metrics_df, **arguments)
        except TypeError as e:
            # keep only the name of the offending argument when the message has the usual shape;
            # the wording before it differs between Python versions, so no fixed offset is used
            message = str(e)
            _, found, tail = message.partition("unexpected keyword argument ")
            raise ValueError('Unrecognized parameter found {}'.format(tail if found else message)) from e
        metrics.extract()

        self.metrics = metrics.metrics
        self.metrics.to_csv(self.destination, index=False)

        # get the df of metrics used from the Metrics class; work on a copy so that
        # the callables stored in the Metrics instance are not replaced by their names
        metrics_df = metrics.metrics_list.copy()
        metrics_df['callable'] = metrics_df.apply(lambda row: row['callable'].__name__, axis=1)  # from the callables used, find their name back
        parameters['metrics_list'] = [
            {k: v for k, v in m.items() if pd.notnull(v)}
            for m in metrics_df.to_dict(orient='records')
        ]
        date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

        # create a yaml file with all the parameters used
        self.parameters_path = os.path.splitext(self.destination)[0] + "_parameters_{}.yml".format(date)
        print("exported metrics to {}".format(self.destination))
        with open(self.parameters_path, "w+") as stream:
            yaml.dump(
                {
                    "package_version": ChildProject.__version__,
                    "date": date,
                    "parameters": parameters,
                },
                stream,
                sort_keys=False,
            )
        print("exported metrics parameters to {}".format(self.parameters_path))

        return self.metrics

    @staticmethod
    def setup_parser(parser):
        """declare the command line arguments of the pipeline

        :param parser: parser to add the arguments to
        :type parser: argparse.ArgumentParser
        """
        parser.add_argument("parameters_input", help="path to the yml file with all parameters")