Source code for ChildProject.pipelines.eafbuilder

import argparse
import pandas as pd
import sys
import os
import shutil

from ChildProject.projects import ChildProject
from ChildProject.pipelines.pipeline import Pipeline
from ChildProject.tables import assert_dataframe, assert_columns_presence


[docs]def create_eaf( etf_path: str, id: str, output_dir: str, recording_filename: str, timestamps_list: list, eaf_type: str, contxt_on: int, contxt_off: int, template: str, ): import pympi eaf = pympi.Elan.Eaf(etf_path) ling_type = "transcription" eaf.add_tier("code_" + eaf_type, ling=ling_type) eaf.add_tier("context_" + eaf_type, ling=ling_type) eaf.add_tier("code_num_" + eaf_type, ling=ling_type) for i, ts in enumerate(timestamps_list): print("Creating eaf code segment # ", i + 1) print("enumerate makes: ", i, ts) whole_region_onset = int(ts[0]) whole_region_offset = int(ts[1]) context_onset = int(whole_region_onset - contxt_on) context_offset = int(whole_region_offset + contxt_off) if context_onset < 0: context_onset = 0 codeNumVal = eaf_type + str(i + 1) eaf.add_annotation("code_" + eaf_type, whole_region_onset, whole_region_offset) eaf.add_annotation( "code_num_" + eaf_type, whole_region_onset, whole_region_offset, value=codeNumVal, ) eaf.add_annotation("context_" + eaf_type, context_onset, context_offset) destination = os.path.join(output_dir, "{}.eaf".format(id)) os.makedirs(os.path.dirname(destination), exist_ok=True) mime_type = "audio/x-wav" mime_types = {"mp3": "audio/mpeg", "mp4": "audio/mp4", "flac": "audio/x-flac"} mime_types.update(eaf.MIMES) extension = os.path.splitext(recording_filename)[1] if extension: extension = extension.lower() if extension in mime_types: mime_type = mime_types[extension[1:]] eaf.add_linked_file( file_path=recording_filename, relpath=recording_filename, mimetype=mime_type ) eaf.to_file(destination) for i in eaf.get_tier_names(): print(i, ":", eaf.get_annotation_data_for_tier(i)) return eaf
[docs]class EafBuilderPipeline(Pipeline): def __init__(self): pass
[docs] def run( self, destination: str, segments: str, eaf_type: str, template: str, context_onset: int = 0, context_offset: int = 0, **kwargs ): """generate .eaf templates based on intervals to code. :param path: project path :type path: str :param destination: eaf destination :type destination: str :param segments: path to the input segments dataframe :type segments: str :param eaf_type: eaf-type [random, periodic] :type eaf_type: str :param template: name of the template to use (basic, native, or non-native) :type template: str :param context_onset: context onset and segment offset difference in milliseconds, 0 for no introductory context :type context_onset: int :param context_offset: context offset and segment offset difference in milliseconds, 0 for no outro context :type context_offset: int """ try: from importlib import resources except ImportError: # TODO: Perhaps add this as a dependency to the resources? import importlib_resources as resources etf_path = "{}.etf".format(template) pfsx_path = "{}.pfsx".format(template) if template in ["basic", "native", "non-native"]: with resources.path("ChildProject.templates", etf_path) as etf: etf_path = str(etf) with resources.path("ChildProject.templates", pfsx_path) as pfsx: pfsx_path = str(pfsx) if not os.path.exists(etf_path): raise Exception("{} cannot be found".format(etf_path)) if not os.path.exists(pfsx_path): raise Exception("{} cannot be found".format(pfsx_path)) print("making the " + eaf_type + " eaf file and csv") segments = pd.read_csv(segments) assert_dataframe("segments", segments, not_empty=True) assert_columns_presence( "segments", segments, {"recording_filename", "segment_onset", "segment_offset"}, ) for recording_filename, segs in segments.groupby("recording_filename"): recording_prefix = os.path.splitext(recording_filename)[0] output_filename = ( recording_prefix + "_" + eaf_type + "_" + os.path.basename(template) ) # TODO: This list of timestamps as tuples might not be ideal/should perhaps be optimized, but I am just replicating the original eaf creation code here. timestamps = [ (on, off) for on, off in segs.loc[:, ["segment_onset", "segment_offset"]].values ] output_dir = os.path.join(destination, recording_prefix) create_eaf( etf_path, output_filename, output_dir, recording_filename, timestamps, eaf_type, context_onset, context_offset, template, ) shutil.copy( pfsx_path, os.path.join(output_dir, "{}.pfsx".format(output_filename)) )
[docs] @staticmethod def setup_parser(parser): parser.add_argument("--destination", help="eaf destination") parser.add_argument( "--segments", help="path to the input segments dataframe", required=True ) # TODO: add other options here such as high-volubility, energy, etc.? parser.add_argument( "--eaf-type", help="eaf-type", choices=["random", "periodic", "high-volubility", "energy-detection"], required=True, ) parser.add_argument( "--template", help="Which ACLEW templates (basic, native or non-native); otherwise, the path to the etf et pfsx templates, without the extension.", required=True, ) parser.add_argument( "--context-onset", help="context onset and segment offset difference in milliseconds, 0 for no introductory context", type=int, default=0, ) parser.add_argument( "--context-offset", help="context offset and segment offset difference in milliseconds, 0 for no outro context", type=int, default=0, )