Source code for ChildProject.pipelines.eafbuilder

import argparse
import pandas as pd
import sys
import os
import shutil

from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager
from ChildProject.pipelines.pipeline import Pipeline
from ChildProject.tables import assert_dataframe, assert_columns_presence
from ChildProject.converters import Formats

FORMAT_SPEECH = {Formats.VTC.value,Formats.VCM.value} #formats for which nan must be replaced with SPEECH
CHILDPROJECT_TYPE = "childProject_generated"

[docs]def create_eaf(
    etf_path: str,
    id: str,
    output_dir: str,
    recording_filename: str,
    timestamps_list: list,
    eaf_type: str,
    contxt_on: int,
    contxt_off: int,
    template: str,
    speech_segments: pd.DataFrame = None,
    imported_set: str = None,
    imported_format: str = None,
):
    import pympi

    eaf = pympi.Elan.Eaf(etf_path)
    
    #create a default ling_type for generated tiers to be always time-aligneable
    eaf.add_linguistic_type(CHILDPROJECT_TYPE, timealignable=True)
    
    eaf.add_tier("SAMPLER", ling=CHILDPROJECT_TYPE)
    eaf.add_tier("code_" + eaf_type, ling=CHILDPROJECT_TYPE, parent="SAMPLER")
    eaf.add_tier("context_" + eaf_type, ling=CHILDPROJECT_TYPE, parent="SAMPLER")
    eaf.add_tier("code_num_" + eaf_type, ling=CHILDPROJECT_TYPE, parent="SAMPLER")

    for i, ts in enumerate(timestamps_list):
        print("Creating eaf code segment # ", i + 1)
        print("enumerate makes: ", i, ts)
        whole_region_onset = int(ts[0])
        whole_region_offset = int(ts[1])

        context_onset = int(whole_region_onset - contxt_on)
        context_offset = int(whole_region_offset + contxt_off)

        if context_onset < 0:
            context_onset = 0

        codeNumVal = eaf_type + str(i + 1)
        eaf.add_annotation("code_" + eaf_type, whole_region_onset, whole_region_offset)
        eaf.add_annotation(
            "code_num_" + eaf_type,
            whole_region_onset,
            whole_region_offset,
            value=codeNumVal,
        )
        eaf.add_annotation("context_" + eaf_type, context_onset, context_offset)

    if speech_segments is not None:
        for segment in speech_segments.to_dict(orient="records"):
            speaker_id = None

            if "speaker_id" in segment:
                speaker_id = segment["speaker_id"]
            elif "speaker_type" in segment:
                speaker_id = segment["speaker_type"]
                if pd.isnull(speaker_id) and imported_format in FORMAT_SPEECH : speaker_id = "SPEECH" #replace  nan with SPEECH for some formats

            if speaker_id is None:
                continue
            
            if imported_set: speaker_id = "{}-{}".format(imported_set.replace("/","_").upper(),speaker_id)

            if speaker_id not in eaf.tiers:
                eaf.add_tier(speaker_id, ling=CHILDPROJECT_TYPE)

            eaf.add_annotation(
                speaker_id,
                int(segment["segment_onset"]),
                int(segment["segment_offset"]),
            )

    destination = os.path.join(output_dir, "{}.eaf".format(id))
    os.makedirs(os.path.dirname(destination), exist_ok=True)

    mime_type = "audio/x-wav"
    mime_types = {"mp3": "audio/mpeg", "mp4": "audio/mp4", "flac": "audio/x-flac"}
    mime_types.update(eaf.MIMES)

    extension = os.path.splitext(recording_filename)[1]
    if extension:
        extension = extension.lower()
        if extension in mime_types:
            mime_type = mime_types[extension[1:]]

    eaf.add_linked_file(
        file_path=recording_filename, relpath=recording_filename, mimetype=mime_type
    )

    eaf.to_file(destination)
    for i in eaf.get_tier_names():
        print(i, ":", eaf.get_annotation_data_for_tier(i))

    return eaf


[docs]class EafBuilderPipeline(Pipeline):
    def __init__(self):
        pass

[docs]    def run(
        self,
        destination: str,
        segments: str,
        eaf_type: str,
        template: str,
        context_onset: int = 0,
        context_offset: int = 0,
        path: str = None,
        import_speech_from: str = None,
        **kwargs,
    ):
        """generate .eaf templates based on intervals to code.

        :param path: project path
        :type path: str
        :param destination: eaf destination
        :type destination: str
        :param segments: path to the input segments dataframe
        :type segments: str
        :param eaf_type: eaf-type [random, periodic]
        :type eaf_type: str
        :param template: name of the template to use (basic, native, or non-native)
        :type template: str
        :param context_onset: context onset and segment offset difference in milliseconds, 0 for no introductory context
        :type context_onset: int
        :param context_offset: context offset and segment offset difference in milliseconds, 0 for no outro context
        :type context_offset: int
        """

        try:
            from importlib import resources
        except ImportError:
            # TODO: Perhaps add this as a dependency to the resources?
            import importlib_resources as resources

        etf_path = "{}.etf".format(template)
        pfsx_path = "{}.pfsx".format(template)

        if template in ["basic", "native", "non-native"]:
            with resources.path("ChildProject.templates", etf_path) as etf:
                etf_path = str(etf)

            with resources.path("ChildProject.templates", pfsx_path) as pfsx:
                pfsx_path = str(pfsx)

        if not os.path.exists(etf_path):
            raise Exception("{} cannot be found".format(etf_path))

        if not os.path.exists(pfsx_path):
            raise Exception("{} cannot be found".format(pfsx_path))

        print("making the " + eaf_type + " eaf file and csv")

        segments = pd.read_csv(segments)

        assert_dataframe("segments", segments, not_empty=True)
        assert_columns_presence(
            "segments",
            segments,
            {"recording_filename", "segment_onset", "segment_offset"},
        )

        imported_set = None
        prefill = path and import_speech_from
        if prefill:
            project = ChildProject(path)
            am = AnnotationManager(project)
            am.read()
            imported_set = import_speech_from

        for recording_filename, segs in segments.groupby("recording_filename"):
            recording_prefix = os.path.splitext(recording_filename)[0]
            output_filename = (
                recording_prefix + "_" + eaf_type + "_" + os.path.basename(template)
            )

            # TODO: This list of timestamps as tuples might not be ideal/should perhaps be optimized, but I am just replicating the original eaf creation code here.
            timestamps = [
                (on, off)
                for on, off in segs.loc[:, ["segment_onset", "segment_offset"]].values
            ]

            speech_segments = None
            imported_format = None
            if prefill:
                ranges = segs.assign(recording_filename=recording_filename).rename(
                    columns={
                        "segment_onset": "range_onset",
                        "segment_offset": "range_offset",
                    }
                )
                matches = am.get_within_ranges(ranges, [import_speech_from], 'warn')

                if len(matches) == 0:
                    continue

                speech_segments = am.get_segments(matches)
                try:
                    matches = matches["format"].drop_duplicates()
                    if len(matches.index) == 1:
                        imported_format = matches.iloc[0]
                except KeyError:
                    imported_format = None
                    

            output_dir = os.path.join(destination, recording_prefix)

            create_eaf(
                etf_path,
                output_filename,
                output_dir,
                recording_filename,
                timestamps,
                eaf_type,
                context_onset,
                context_offset,
                template,
                speech_segments,
                imported_set,
                imported_format,
            )

            shutil.copy(
                pfsx_path, os.path.join(output_dir, "{}.pfsx".format(output_filename))
            )

[docs]    @staticmethod
    def setup_parser(parser):
        parser.add_argument("--destination", help="eaf destination")
        parser.add_argument(
            "--segments", help="path to the input segments dataframe", required=True
        )
        # TODO: add other options here such as high-volubility, energy, etc.?
        parser.add_argument(
            "--eaf-type",
            help="eaf-type",
            choices=["random", "periodic", "high-volubility", "energy-detection"],
            required=True,
        )
        parser.add_argument(
            "--template",
            help="Which ACLEW templates (basic, native or non-native); otherwise, the path to the etf et pfsx templates, without the extension.",
            required=True,
        )
        parser.add_argument(
            "--context-onset",
            help="context onset and segment offset difference in milliseconds, 0 for no introductory context",
            type=int,
            default=0,
        )
        parser.add_argument(
            "--context-offset",
            help="context offset and segment offset difference in milliseconds, 0 for no outro context",
            type=int,
            default=0,
        )
        parser.add_argument(
            "--path",
            help="path to the input dataset. Required together with --import-speech-from for pre-filling the .eaf",
            required=False,
        )
        parser.add_argument(
            "--import-speech-from",
            help="set of annotations from which speech segments should be imported in order to pre-fill the annotations.",
            required=False,
        )