import pandas as pd
import numpy as np
import ast
import re
import functools
from typing import Union, Set, Tuple
"""
This file lists all the metrics functions commonly used.
New metrics can be added by defining new functions for the Metrics class to use :
- Create a new function using the same arguments (i.e. annotations, duration, **kwargs)
- Define calculation of the metric with:
- annotations, which is a dataframe containing all the relevant annotated segments to use. It contains the
annotation content (https://childproject.readthedocs.io/en/latest/format.html#id10) joined with the annotation
index info (https://childproject.readthedocs.io/en/latest/format.html#id11) as well as any column that was
requested to be added to the results by the user using --child-cols or --rec-cols (eg --child-cols child_dob,
languages will make columns 'child_dob' and 'languages' available)
- duration which is the duration of audio annotated in milliseconds
- kwargs, whatever keyword parameter you chose to pass to the function (except 'name', 'callable', 'set' which can
not be used). This will need to be given with the list of metrics when called
- Wrap you function with the 'metricFunction' decorator to make it callable by the pipeline, read metricFunction help
for more info
!! Metrics functions should still behave and return the correct result when receiving an empty dataframe
"""
# error message in case of missing columns in annotations
MISSING_COLUMNS = 'The given set <{}> does not have the required column(s) <{}> for computing the {} metric'
RESERVED = {'set', 'name', 'callable'} # arguments reserved usage. use other keyword labels.
[docs]def metricFunction(args: set, columns: Union[Set[str], Tuple[Set[str], ...]], empty_value=0, default_name: str = None):
"""Decorator for all metrics functions to make them ready to be called by the pipeline.
:param args: set of required keyword arguments for that function, raise ValueError if were not given \
you cannot use keywords [name, callable, set] as they are reserved
:type args: set
:param columns: required columns in the dataframe given, missing columns raise ValueError
:type columns: set
:param default_name: default name to use for the metric in the resulting dataframe. Every keyword argument found in the name will be replaced by its value (e.g. 'voc_speaker_ph' uses kwarg 'speaker' so if speaker = 'CHI', name will be 'voc_chi_ph'). if no name is given, the __name__ of the function is used
:type default_name: str
:param empty_value: value to return when annotations are empty but the unit was annotated (e.g. 0 for counts like voc_speaker_ph , None for proportions like lp_n)
:type empty_value: float|int
:return: new function to substitute the metric function
:rtype: Callable
"""
def decorator(function):
for a in args:
if a in RESERVED:
raise ValueError(
'Error when defining {} with required argument {}, you cannot use reserved keywords {},\
change your required argument name'.format(
function.__name__, a, RESERVED))
@functools.wraps(function)
def new_func(annotations: pd.DataFrame, duration: int, **kwargs):
for arg in args:
if arg not in kwargs:
raise ValueError(f"{function.__name__} metric needs an argument <{arg}>")
# if a name is explicitly given, use it
if 'name' in kwargs and not pd.isnull(kwargs['name']) and kwargs['name']:
metric_name = kwargs['name']
# else if a default name for the function exists, use the function name
elif default_name:
metric_name = default_name
# else, no name was found, use the name of the function
else:
metric_name = function.__name__
metric_name_replaced = metric_name
# metric_name is the basename used to designate this metric (voc_speaker_ph),
# metric_name_replaced replaces the values of kwargs
# found in the name by their values, giving the metric name for that instance only (voc_chi_ph)
for arg in kwargs:
metric_name_replaced = re.sub(arg, str(kwargs[arg]).lower(), metric_name_replaced)
if annotations.shape[0]:
# if multiple possibilities of columns, explore each and fail only if each combination is missing
# a column, if one possibility, fail if a column is missing
if isinstance(columns, tuple) and len(columns) > 0 and isinstance(columns[0], set):
missing_columns = []
for possible_cols in columns:
possible_missing = possible_cols - set(annotations.columns)
if possible_missing:
missing_columns.append(possible_missing)
# if we have as many cases of missing columns as possibilities, we can't compute the metric
if len(missing_columns) == len(columns):
raise ValueError(
MISSING_COLUMNS.format(annotations['set'].iloc[0],
' or '.join([str(s) for s in missing_columns]),
metric_name))
else:
missing_columns = columns - set(annotations.columns)
if missing_columns:
raise ValueError(MISSING_COLUMNS.format(annotations['set'].iloc[0], missing_columns, metric_name))
res = function(annotations, duration, **kwargs)
else: # no annotation for that unit
res = empty_value if duration else None # duration != 0 => was annotated but not segments there
return metric_name_replaced, res
return new_func
return decorator
[docs]def peak_hour_metric(empty_value=0):
"""
empty_value : should repeat the empty value of the metric function wrapper (as this will be used for empty periods)
"""
def decorator(function):
"""Decorator a metric function to select the maximum value observed over 1h periods. function is prefixed with
'peak_'
"""
@functools.wraps(function)
def new_func(annotations: pd.DataFrame, duration: int, **kwargs):
# time to consider for periods, here 1h by default, else put it in kwargs
period_time = 3600000 if 'period_time' not in kwargs else kwargs['period_time']
periods = duration // period_time # number of hours to consider
# what hour it belongs to (we made the choice of using onset to choose the hour)
annotations['hour_number_metric'] = annotations['segment_onset'] // period_time
result_array = np.array([])
for i in range(periods):
# select the annotations for this hour
period_annotations = annotations[annotations['hour_number_metric'] == i]
if period_annotations.shape[0]:
# compute metric for the period
metric = function(period_annotations, period_time, **kwargs)
else:
metric = empty_value
result_array = np.append(result_array, metric) # store the result
# if we have results, return the max, else return NaN
if len(result_array):
return np.nanmax(result_array)
else:
return np.nan
# wraps will give the same name and doc, so we need to slightly edit them for the peak function
new_func.__doc__ = "Computing the peak for 1h for the following metric:\n\n" + function.__doc__
new_func.__name__ = "peak_" + function.__name__
new_func.__qualname__ = "peak_" + function.__qualname__
return new_func
return decorator
[docs]def per_hour_metric():
"""
"""
def decorator(function):
"""Decorator creating a metric function controlling the original value by time. function is suffixed with '_ph'
"""
@functools.wraps(function)
def new_func(annotations: pd.DataFrame, duration: int, **kwargs):
# time to consider for periods, here 1h by default, else put it in kwargs
return function(annotations, duration, **kwargs) * (3600000 / duration)
# wraps will give the same name and doc, so we need to slightly edit them for the peak function
new_func.__doc__ = function.__doc__ + "\nThis value is a 'per hour' value."
new_func.__name__ = function.__name__ + '_ph'
new_func.__qualname__ = function.__qualname__ + '_ph'
return new_func
return decorator
[docs]def voc_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
"""number of vocalizations for a given speaker type
Required keyword arguments:
- speaker : speaker_type to use
"""
return annotations[annotations["speaker_type"] == kwargs["speaker"]].shape[0]
# Decorate for the peak metric, per hour metric, and then the classic metric to avoid conflicts of decoration
peak_voc_speaker = metricFunction({"speaker"}, {"speaker_type"})(peak_hour_metric()(voc_speaker))
voc_speaker_ph = metricFunction({"speaker"}, {"speaker_type"})(per_hour_metric()(voc_speaker))
voc_speaker = metricFunction({"speaker"}, {"speaker_type"})(voc_speaker)
[docs]def voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
"""total duration of vocalizations by a given speaker type in milliseconds per hour
Required keyword arguments:
- speaker : speaker_type to use
"""
return annotations[annotations["speaker_type"] == kwargs["speaker"]]["duration"].sum()
# Decorate for the peak metric, per hour metric, and then the classic metric to avoid conflicts of decoration
peak_voc_dur_speaker = metricFunction({"speaker"}, {"speaker_type", "duration"})(peak_hour_metric()(voc_dur_speaker))
voc_dur_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "duration"})(per_hour_metric()(voc_dur_speaker))
voc_dur_speaker = metricFunction({"speaker"}, {"speaker_type", "duration"})(voc_dur_speaker)
[docs]@metricFunction({"speaker"}, {"speaker_type", "duration"}, np.nan)
def avg_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
"""average duration in milliseconds of vocalizations for a given speaker type
Required keyword arguments:
- speaker : speaker_type to use
"""
return annotations[annotations["speaker_type"] == kwargs["speaker"]]["duration"].mean()
[docs]def wc_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
"""number of words for a given speaker type
Required keyword arguments:
- speaker : speaker_type to use
"""
return annotations[annotations["speaker_type"] == kwargs["speaker"]]["words"].sum()
peak_wc_speaker = metricFunction({"speaker"}, {"speaker_type", "words"})(peak_hour_metric()(wc_speaker))
wc_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "words"})(per_hour_metric()(wc_speaker))
wc_speaker = metricFunction({"speaker"}, {"speaker_type", "words"})(wc_speaker)
[docs]def sc_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
"""number of syllables for a given speaker type
Required keyword arguments:
- speaker : speaker_type to use
"""
return annotations[annotations["speaker_type"] == kwargs["speaker"]]["syllables"].sum()
peak_sc_speaker = metricFunction({"speaker"}, {"speaker_type", "syllables"})(peak_hour_metric()(sc_speaker))
sc_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "syllables"})(per_hour_metric()(sc_speaker))
sc_speaker = metricFunction({"speaker"}, {"speaker_type", "syllables"})(sc_speaker)
[docs]def pc_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
"""number of phonemes for a given speaker type
Required keyword arguments:
- speaker : speaker_type to use
"""
return annotations[annotations["speaker_type"] == kwargs["speaker"]]["phonemes"].sum()
peak_pc_speaker = metricFunction({"speaker"}, {"speaker_type", "phonemes"})(peak_hour_metric()(pc_speaker))
pc_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "phonemes"})(per_hour_metric()(pc_speaker))
pc_speaker = metricFunction({"speaker"}, {"speaker_type", "phonemes"})(pc_speaker)
[docs]def wc_adu(annotations: pd.DataFrame, duration: int, **kwargs):
"""number of words for all speakers
Required keyword arguments:
"""
return annotations["words"].sum()
peak_wc_adu = metricFunction(set(), {"words"})(peak_hour_metric()(wc_adu))
wc_adu_ph = metricFunction(set(), {"words"})(per_hour_metric()(wc_adu))
wc_adu = metricFunction(set(), {"words"})(wc_adu)
[docs]def sc_adu(annotations: pd.DataFrame, duration: int, **kwargs):
"""number of syllables for all speakers
Required keyword arguments:
"""
return annotations["syllables"].sum()
peak_sc_adu = metricFunction(set(), {"syllables"})(peak_hour_metric()(sc_adu))
sc_adu_ph = metricFunction(set(), {"syllables"})(per_hour_metric()(sc_adu))
sc_adu = metricFunction(set(), {"syllables"})(sc_adu)
[docs]def pc_adu(annotations: pd.DataFrame, duration: int, **kwargs):
"""number of phonemes for all speakers
Required keyword arguments:
"""
return annotations["phonemes"].sum()
peak_pc_adu = metricFunction(set(), {"phonemes"})(peak_hour_metric()(pc_adu))
pc_adu_ph = metricFunction(set(), {"phonemes"})(per_hour_metric()(pc_adu))
pc_adu = metricFunction(set(), {"phonemes"})(pc_adu)
[docs]def cry_voc_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
"""number of cry vocalizations for a given speaker (based on vcm_type or lena cries)
Required keyword arguments:
- speaker : speaker_type to use
"""
if 'vcm_type' in annotations.columns:
return annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) &
(annotations["vcm_type"] == "Y")].shape[0]
# elif 'cries' in annotations.columns:
else:
return annotations[annotations['speaker_type'] == kwargs["speaker"]]["cries"].apply(lambda x: len(ast.literal_eval(x))).sum()
peak_cry_voc_speaker = metricFunction({"speaker"}, ({"speaker_type", "vcm_type"}, {"speaker_type", "cries"})
)(peak_hour_metric()(cry_voc_speaker))
cry_voc_speaker_ph = metricFunction({"speaker"}, ({"speaker_type", "vcm_type"}, {"speaker_type", "cries"})
)(per_hour_metric()(cry_voc_speaker))
cry_voc_speaker = metricFunction({"speaker"}, ({"speaker_type", "vcm_type"}, {"speaker_type", "cries"})
)(cry_voc_speaker)
[docs]def cry_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
"""total duration of cry vocalizations by a given speaker type in milliseconds (based on vcm_type or lena cry)
Required keyword arguments:
- speaker : speaker_type to use
"""
if 'vcm_type' in annotations.columns and 'duration' in annotations.columns:
return annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) &
(annotations["vcm_type"] == "Y")]["duration"].sum()
# elif 'child_cry_vfx_len' in annotations.columns:
else:
return annotations[annotations['speaker_type'] == kwargs["speaker"]]["child_cry_vfx_len"].sum()
peak_cry_voc_dur_speaker = metricFunction({"speaker"}, ({"speaker_type", "vcm_type", "duration"}, {"speaker_type", "child_cry_vfx_len"}))(
peak_hour_metric()(cry_voc_dur_speaker))
cry_voc_dur_speaker_ph = metricFunction({"speaker"}, ({"speaker_type", "vcm_type", "duration"}, {"speaker_type", "child_cry_vfx_len"}))(
per_hour_metric()(cry_voc_dur_speaker))
cry_voc_dur_speaker = metricFunction({"speaker"}, ({"speaker_type", "vcm_type", "duration"}, {"speaker_type", "child_cry_vfx_len"}))(cry_voc_dur_speaker)
[docs]@metricFunction({"speaker"}, ({"speaker_type", "vcm_type", "duration"}, {'speaker_type', "child_cry_vfx_len", "cries"}), np.nan)
def avg_cry_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
"""average duration of cry vocalizations by a given speaker type (based on vcm_type or lena cries)
Required keyword arguments:
- speaker : speaker_type to use
"""
if 'vcm_type' in annotations.columns and 'duration' in annotations.columns:
value = annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) &
(annotations["vcm_type"] == "Y")]["duration"].mean()
else:
annots = annotations[annotations['speaker_type'] == kwargs["speaker"]]
value = annots["child_cry_vfx_len"].sum() / annots["cries"].apply(lambda x: len(ast.literal_eval(x))).sum()
if pd.isnull(value):
value = 0
return value
[docs]def can_voc_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
"""number of canonical vocalizations for a given speaker type (based on vcm_type)
Required keyword arguments:
- speaker : speaker_type to use
"""
return annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & (annotations["vcm_type"] == "C")].shape[
0]
peak_can_voc_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type"})(peak_hour_metric()(can_voc_speaker))
can_voc_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "vcm_type"})(per_hour_metric()(can_voc_speaker))
can_voc_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type"})(can_voc_speaker)
[docs]def can_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
"""total duration of canonical vocalizations by a given speaker type in milliseconds (based on vcm_type)
Required keyword arguments:
- speaker : speaker_type to use
"""
return annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & (annotations["vcm_type"] == "C")][
"duration"].sum()
peak_can_voc_dur_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"})(
peak_hour_metric()(can_voc_dur_speaker))
can_voc_dur_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"})(
per_hour_metric()(can_voc_dur_speaker))
can_voc_dur_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"})(can_voc_dur_speaker)
[docs]@metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"}, np.nan)
def avg_can_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
"""average duration of canonical vocalizations for a given speaker type (based on vcm_type)
Required keyword arguments:
- speaker : speaker_type to use
"""
value = annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) & (annotations["vcm_type"] == "C")][
"duration"].mean()
if pd.isnull(value): value = 0
return value
[docs]def non_can_voc_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
"""number of non-canonical vocalizations for a given speaker type (based on vcm_type)
Required keyword arguments:
- speaker : speaker_type to use
"""
return annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) &
(annotations["vcm_type"] == "N")].shape[0]
peak_non_can_voc_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type"})(
peak_hour_metric()(non_can_voc_speaker))
non_can_voc_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "vcm_type"})(
per_hour_metric()(non_can_voc_speaker))
non_can_voc_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type"})(non_can_voc_speaker)
[docs]def non_can_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
"""total duration of non-canonical vocalizations by a given speaker type in milliseconds (based on vcm_type)
Required keyword arguments:
- speaker : speaker_type to use
"""
return annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) &
(annotations["vcm_type"] == "N")]["duration"].sum()
peak_non_can_voc_dur_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"})(
peak_hour_metric()(non_can_voc_dur_speaker))
non_can_voc_dur_speaker_ph = metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"})(
per_hour_metric()(non_can_voc_dur_speaker))
non_can_voc_dur_speaker = metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"})(non_can_voc_dur_speaker)
[docs]@metricFunction({"speaker"}, {"speaker_type", "vcm_type", "duration"}, np.nan)
def avg_non_can_voc_dur_speaker(annotations: pd.DataFrame, duration: int, **kwargs):
"""average duration of non-canonical vocalizations for a given speaker type (based on vcm_type)
Required keyword arguments:
- speaker : speaker_type to use
"""
value = annotations.loc[(annotations["speaker_type"] == kwargs["speaker"]) &
(annotations["vcm_type"] == "N")]["duration"].mean()
if pd.isnull(value):
value = 0
return value
[docs]@metricFunction(set(), set(), np.nan)
def lp_n(annotations: pd.DataFrame, duration: int, **kwargs):
"""linguistic proportion on the number of vocalizations for CHI (based on vcm_type or [cries,vfxs,utterances_count] if vcm_type does not exist)
Required keyword arguments:
"""
if {"cries", "vfxs", "utterances_count"}.issubset(annotations.columns):
annotations = annotations[annotations["speaker_type"] == "CHI"]
cries = annotations["cries"].apply(lambda x: len(ast.literal_eval(x))).sum()
vfxs = annotations["vfxs"].apply(lambda x: len(ast.literal_eval(x))).sum()
utterances = annotations["utterances_count"].sum()
total = (utterances + cries + vfxs)
if total:
value = utterances / total
else:
value = np.nan
elif "vcm_type" in annotations.columns:
speech_voc = annotations.loc[(annotations["speaker_type"] == "CHI") &
(annotations["vcm_type"].isin(["N", "C"]))].shape[0]
cry_voc = annotations.loc[(annotations["speaker_type"] == "CHI") & (annotations["vcm_type"] == "Y")].shape[0]
total = speech_voc + cry_voc
if total:
value = speech_voc / total
else:
value = np.nan
else:
raise ValueError(
"the given set does not have the necessary columns for this metric, choose a set that contains either ["
"vcm_type] or [cries,vfxs,utterances_count]")
return value
[docs]@metricFunction(set(), {"speaker_type", "vcm_type"}, np.nan)
def cp_n(annotations: pd.DataFrame, duration: int, **kwargs):
"""canonical proportion on the number of vocalizations for CHI (based on vcm_type)
Required keyword arguments:
"""
speech_voc = annotations.loc[(annotations["speaker_type"] == "CHI") &
(annotations["vcm_type"].isin(["N", "C"]))].shape[0]
can_voc = annotations.loc[(annotations["speaker_type"] == "CHI") & (annotations["vcm_type"] == "C")].shape[0]
if speech_voc:
value = can_voc / speech_voc
else:
value = np.nan
return value
[docs]@metricFunction(set(), set(), np.nan)
def lp_dur(annotations: pd.DataFrame, duration: int, **kwargs):
"""linguistic proportion on the duration of vocalizations for CHI (based on vcm_type or [child_cry_vfxs_len,utterances_length] if vcm_type does not exist)
Required keyword arguments:
"""
if {"child_cry_vfx_len", "utterances_length"}.issubset(annotations.columns):
annotations = annotations[annotations["speaker_type"] == "CHI"]
utter_len = annotations["utterances_length"].sum()
total = annotations["child_cry_vfx_len"].sum() + utter_len
if total:
value = utter_len / total
else:
value = np.nan
elif "vcm_type" in annotations.columns:
speech_dur = annotations.loc[(annotations["speaker_type"] == "CHI") &
(annotations["vcm_type"].isin(["N", "C"]))]["duration"].sum()
cry_dur = annotations.loc[(annotations["speaker_type"] == "CHI") &
(annotations["vcm_type"] == "Y")]["duration"].sum()
total = speech_dur + cry_dur
if total:
value = speech_dur / total
else:
value = np.nan
else:
raise ValueError(
"the {} set does not have the necessary columns for this metric, choose a set that contains either ["
"vcm_type] or [child_cry_vfx_len,utterances_length]")
return value
[docs]@metricFunction(set(), {"speaker_type", "vcm_type", "duration"}, np.nan)
def cp_dur(annotations: pd.DataFrame, duration: int, **kwargs):
"""canonical proportion on the number of vocalizations for CHI (based on vcm_type)
Required keyword arguments:
"""
speech_dur = annotations.loc[(annotations["speaker_type"] == "CHI") &
(annotations["vcm_type"].isin(["N", "C"]))]["duration"].sum()
can_dur = annotations.loc[(annotations["speaker_type"] == "CHI") &
(annotations["vcm_type"] == "C")]["duration"].sum()
if speech_dur:
value = can_dur / speech_dur
else:
value = np.nan
return value
[docs]def lena_CVC(annotations: pd.DataFrame, duration: int, **kwargs):
"""number of child vocalizations according to LENA's extraction
Required keyword arguments:
"""
return annotations["utterances_count"].sum()
peak_lena_CVC = metricFunction(set(), {"utterances_count"})(peak_hour_metric()(lena_CVC))
lena_CVC_ph = metricFunction(set(), {"utterances_count"})(per_hour_metric()(lena_CVC))
lena_CVC = metricFunction(set(), {"utterances_count"})(lena_CVC)
[docs]def lena_CTC(annotations: pd.DataFrame, duration: int, **kwargs):
"""number of conversational turn counts according to LENA's extraction
Required keyword arguments:
"""
conv_types = {'TIMR', 'TIFR'}
return annotations[annotations["lena_conv_turn_type"].isin(conv_types)].shape[0]
peak_lena_CTC = metricFunction(set(), {"lena_conv_turn_type"})(peak_hour_metric()(lena_CTC))
lena_CTC_ph = metricFunction(set(), {"lena_conv_turn_type"})(per_hour_metric()(lena_CTC))
lena_CTC = metricFunction(set(), {"lena_conv_turn_type"})(lena_CTC)
[docs]def simple_CTC(annotations: pd.DataFrame,
duration: int,
interlocutors_1=('CHI',),
interlocutors_2=('FEM', 'MAL', 'OCH'),
max_interval=1000,
min_delay=0,
**kwargs):
"""number of conversational turn counts based on vocalizations occurring
in a given interval of one another
keyword arguments:
- interlocutors_1 : first group of interlocutors, default = ['CHI']
- interlocutors_2 : second group of interlocutors, default = ['FEM','MAL','OCH']
- max_interval : maximum interval in ms for it to be considered a turn, default = 1000
- min_delay : minimum delay between somebody starting speaking
"""
# build the interactants groups, every label in interlocutors_1 can interact with interlocutors_2 and vice versa
speakers = set(interlocutors_1 + interlocutors_2)
interactants = {k: set(interlocutors_2) for k in interlocutors_1}
for k in interlocutors_2:
if k in interactants:
interactants[k] = interactants[k] | set(interlocutors_1)
else:
interactants[k] = set(interlocutors_1)
annotations = annotations[annotations["speaker_type"].isin(speakers)].copy()
if annotations.shape[0]:
# store the duration between vocalizations
annotations["iti"] = annotations["segment_onset"] - annotations["segment_offset"].shift(1)
# store the previous speaker
annotations["prev_speaker_type"] = annotations["speaker_type"].shift(1)
annotations["delay"] = annotations["segment_onset"] - annotations["segment_onset"].shift(1)
# not using absolute value for 'iti' is a choice and should be evaluated (we allow speakers to 'interrupt'
# themselves
annotations["is_CT"] = (
(annotations.apply(lambda row: row["prev_speaker_type"] in interactants[row['speaker_type']], axis=1))
&
(annotations['iti'] < max_interval)
&
(annotations['delay'] >= min_delay)
)
return annotations['is_CT'].sum()
else:
return 0
peak_simple_CTC = metricFunction(set(), {"speaker_type"})(peak_hour_metric()(simple_CTC))
simple_CTC_ph = metricFunction(set(), {"speaker_type"})(per_hour_metric()(simple_CTC))
simple_CTC = metricFunction(set(), {"speaker_type"})(simple_CTC)