#!/usr/bin/env python3
#
# This python script extracts audio clips (e.g., from wav or flac files) based on a set of input annotation tables (Raven style).
# Annotation tables should be in CSV, TSV or Excel format (the format is automatically detected based on file extension).
# Example usage:
#   extract_audio_clips.py -i "my_annotations/*.txt" -f -v --relative_audio_filepaths -o extracted_clips --output_format "auto"
#
# Tested using Python 3.11, Mac OS 13.6.5.
#
# Stilianos Louca
# Copyright 2024
#
# LICENSE AGREEMENT
# - - - - - - - - -
# All rights reserved.
# Use and redistributions of this code is permitted for commercial and non-commercial purposes,
# under the following conditions:
#
#   * Redistributions must retain the above copyright notice, this list of
#     conditions and the following disclaimer in the code itself, as well
#     as in documentation and/or other materials provided with the code.
#   * Neither the name of the original author (Stilianos Louca), nor the names
#     of its contributors may be used to endorse or promote products derived
#     from this code without specific prior written permission.
#   * Proper attribution must be given to the original author, including a
#     reference to any peer-reviewed publication through which the code was published.
#
# THIS CODE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS CODE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# - - - - - - - - -

import time
import argparse
import librosa # for loading non-wav files
import soundfile # for writing non-wav files
import scipy
import math
import glob
import os, sys
import pandas
import numpy
from collections import defaultdict

pandas.set_option('display.max_rows', None)
pandas.set_option('display.min_rows', None)
pandas.set_option('display.max_colwidth', None)

# column delimiter associated with each (lower-case) table file extension
FILE_EXTENSION_TO_COLUMN_DELIMITER = {"txt":"\t", "tsv":"\t", "tab":"\t", "csv":",", "dnl":"\t"}


###########################################
# AUXILIARY FUNCTIONS


def file_extension(file_path, skip_gz):
    """Return the right-most extension of a file, without the leading dot.

    If skip_gz is true and the file name ends with ".gz" (case-insensitive),
    the ".gz" part is ignored; for example "data.tsv.gz" yields "tsv".
    Only the file name is inspected, so dots in directory names are ignored.
    An empty string is returned if the file name contains no dot.
    """
    if skip_gz and file_path.lower().endswith(".gz"):
        file_path = file_path[:-3]
    # restrict the search to the file name, so that "my.dir/file" is not
    # mistaken for a file with extension "dir/file"
    file_name = os.path.basename(file_path)
    if "." not in file_name:
        return ""
    return file_name.rsplit(".", 1)[-1]


def infer_file_delimiter(file_path, default=" "):
    """Guess the column delimiter of a classical table file from its extension.

    The extension is looked up case-insensitively (ignoring a trailing ".gz")
    in FILE_EXTENSION_TO_COLUMN_DELIMITER; `default` is returned for
    extensions not listed there.
    """
    extension = file_extension(file_path, skip_gz=True).lower()
    return FILE_EXTENSION_TO_COLUMN_DELIMITER.get(extension, default)


# convert an integer to a string, padded with as many zeros as needed for obtaining the same total width up until a specific maximum value
# for example int2zero_padded_str(3,78) yields "03", and int2zero_padded_str(3,1673) yields "0003".
def int2zero_padded_str(value,max_value):
    """Format an integer as a zero-padded string.

    The width equals the number of decimal digits of abs(max_value), with a
    minimum of one digit. For example int2zero_padded_str(3,78) yields "03",
    and int2zero_padded_str(3,1673) yields "0003".
    """
    # count digits via the decimal representation, which is exact for
    # arbitrarily large integers (unlike floating-point log10)
    width = len(str(max(1, abs(int(max_value)))))
    return ("%0" + str(width) + "d") % (value)


###########################################
# MAIN BODY


def abort(message, exit_code=1):
    """Print a message and terminate the script with the given exit code."""
    print(message)
    sys.exit(exit_code)


def parse_arguments(argv=None):
    """Parse the command line arguments (sys.argv[1:] if argv is None)."""
    parser = argparse.ArgumentParser(description="Extract audio clips based on annotation tables.", epilog="")
    parser.add_argument('-i','--input_tables', required=True, type=str, help="Paths or shell wildcards to input annotation tables, in CSV/TSV/Excel format. Multiple paths must be separated by a colon.")
    parser.add_argument('-o','--output_dir', required=True, type=str, help="Path to output directory where all extracted clips should be saved to.")
    parser.add_argument('-r','--relative_audio_filepaths', action='store_true', dest="relative_audio_filepaths", default=False, help="Auto file paths listed in an input annotation table should be sought relative to that table, i.e., solely based on the audio file's name, even if a full path is given. This may be useful if the whole collection of audio files & annotation tables was moved since the annotation tables were created.")
    parser.add_argument('--output_format', default="auto", choices=["wav","flac","ogg","auto"], help="Output audio format. If 'auto', then output file formats are the same as the input audio format. (default: '%(default)s)'")
    parser.add_argument('--clip_filenames', default="audio_name_and_selection_name", choices=["audio_name_and_selection_name","enumerate"], help="How to determine the output file names for the extracted clips. 'enumerate' means clips are named numerically in the order in which they are encountered across all files; this erases any information on their origin. (default: '%(default)s)'")
    parser.add_argument('--clip_filename_delimiter', default=".", type=str, help="Delimiter to use for constructing the output clip filenames. If this is '/', then it effectively acts as a directory deepener. (default: '%(default)s)'")
    parser.add_argument('-f','--force', action='store_true', dest="force", default=False, help='Replace existing output files without asking.')
    parser.add_argument('--verbose_prefix', default=" ", help="Line prefix to be used for standard output messages. This may be useful if the script is part of another pipeline. (default: '%(default)s)'")
    parser.add_argument('-v','--verbose', action='store_true', dest="verbose", default=False, help='Show lots of information.')
    return parser.parse_args(argv)


def resolve_output_format(audio_path, requested_format):
    """Return the audio format in which a clip of audio_path is to be saved.

    If requested_format is "auto", this is the lower-case extension of
    audio_path (without the dot); otherwise it is requested_format itself.
    """
    if requested_format == "auto":
        return os.path.splitext(audio_path)[1].lower()[1:]
    return requested_format


def build_clip_filepaths(table, output_dir, output_format, clip_filenames, clip_filename_delimiter):
    """Return the list of output clip file paths, one per row of table.

    table must have an "audio_path" column and, for the
    "audio_name_and_selection_name" naming scheme, a "Selection" column
    holding strings. The file extension of each clip is the resolved output
    format of that row, so that "auto" yields the input audio's extension
    rather than a literal ".auto".
    """
    if clip_filenames == "audio_name_and_selection_name":
        return [
            os.path.join(
                output_dir,
                os.path.splitext(os.path.basename(audio_path))[0]
                + clip_filename_delimiter
                + selection_name
                + "."
                + resolve_output_format(audio_path, output_format),
            )
            for audio_path, selection_name in zip(table["audio_path"], table["Selection"])
        ]
    if clip_filenames == "enumerate":
        num_rows = table.shape[0]
        return [
            os.path.join(
                output_dir,
                int2zero_padded_str(1 + k, 1 + num_rows)
                + "."
                + resolve_output_format(audio_path, output_format),
            )
            for k, audio_path in enumerate(table["audio_path"])
        ]
    raise ValueError("Unknown clip filename scheme '%s'" % (clip_filenames))


def load_annotation_tables(table_paths):
    """Load all annotation tables and merge them into a single master table.

    Files ending in ".xlsx" are read as Excel; all others as delimited text,
    with the delimiter inferred from the file extension (tab by default).
    Begin/end times are read as floats and all other columns as strings.
    A "table_path" column records the table each row was read from.
    """
    input_dtypes = defaultdict(lambda: str, {"Begin Time (s)":float, "End Time (s)":float})
    tables = []
    for table_path in table_paths:
        if table_path.lower().endswith(".xlsx"):
            table = pandas.read_excel(table_path, dtype=input_dtypes, na_filter=False)
        else:
            table = pandas.read_csv(table_path, delimiter=infer_file_delimiter(table_path, default="\t"), dtype=input_dtypes, na_filter=False)
        table["table_path"] = table_path
        tables.append(table)
    return pandas.concat(tables, axis=0, join='outer', ignore_index=True, copy=False)


def extract_clips(table, args):
    """Cut out and save one clip per row of table; return the number of clips saved.

    table must have the columns "audio_path", "clip_filepath",
    "Begin Time (s)" and "End Time (s)". Aborts the script if an output file
    already exists and args.force is not set.
    """
    import scipy.io.wavfile # make sure the wavfile submodule is loaded

    num_done = 0
    previous_audio_path = None
    audio = None
    sampling_rate = None
    for _, row in table.iterrows():
        audio_path = row["audio_path"]
        clip_filepath = row["clip_filepath"]
        begin_time = row["Begin Time (s)"]
        end_time = row["End Time (s)"]
        if (not args.force) and os.path.exists(clip_filepath):
            abort("%s ERROR: Output file '%s' already exists. Cowardly refusing to continue"%(args.verbose_prefix,clip_filepath))
        if audio_path.lower().endswith(".wav"):
            # load audio with scipy and cut out the clip that we want. Note that scipy is faster than librosa & soundfile, at least for small files
            # we only load the audio if the path differs from the previous one
            if audio_path != previous_audio_path:
                sampling_rate, audio = scipy.io.wavfile.read(audio_path)
            # clamp at zero, since a negative index would silently count from the end of the audio
            first_sample = max(0, int(begin_time*sampling_rate))
            last_sample = max(first_sample, int(end_time*sampling_rate+1))
            clip = audio[first_sample:last_sample,...]
        else:
            # load non-wav audio using librosa, since scipy only supports wav
            # Note that librosa is much slower than scipy (at least for short audios), and automatically rescales the audio signal to fit within the range -1 to +1.
            # A benefit of librosa is that it allows specifying the start & end position to load, whereas scipy first needs to load the whole audio file
            clip, sampling_rate = librosa.load(audio_path, offset=begin_time, duration=end_time-begin_time, sr=None)
        previous_audio_path = audio_path
        clip_dir = os.path.dirname(clip_filepath)
        if clip_dir:
            # os.makedirs() fails on an empty path, which means the current directory
            os.makedirs(clip_dir, exist_ok=True)
        output_format = resolve_output_format(audio_path, args.output_format)
        if output_format == "wav":
            # for wav, we use scipy, which is a bit faster than soundfile
            scipy.io.wavfile.write(clip_filepath, rate=sampling_rate, data=clip)
        else:
            # use soundfile to save the clip in non-wav format
            soundfile.write(file=clip_filepath, data=clip, samplerate=sampling_rate, format=output_format)
        num_done += 1
        if args.verbose and ((num_done % 100)==0):
            print("%s Note: Done extracting %d out of %d clips"%(args.verbose_prefix,num_done,table.shape[0]))
    return num_done


def main(argv=None):
    """Run the clip extraction for the given command line arguments."""
    # parse command line arguments
    args = parse_arguments(argv)

    # find input annotation tables
    table_paths = [path for path_spec in args.input_tables.split(":") for path in glob.glob(path_spec)]
    if args.verbose:
        print("%sNote: Found %d input annotation tables"%(args.verbose_prefix,len(table_paths)))
    if len(table_paths)==0:
        abort("%sNothing to be done"%(args.verbose_prefix), 0)

    # load & merge all annotation tables into a master table
    table = load_annotation_tables(table_paths)
    if args.verbose:
        print("%sNote: Merged annotation master table has %d rows x %d columns"%(args.verbose_prefix,table.shape[0],table.shape[1]))

    # determine input-audio filepaths
    table["audio_path"] = table["Begin File"]
    if args.relative_audio_filepaths:
        table["audio_path"] = [os.path.join(os.path.dirname(table_path),os.path.basename(audio_path)) for audio_path,table_path in zip(table["audio_path"],table["table_path"])]
    if args.verbose:
        print("%sNote: Annotations cover %d unique audio input files"%(args.verbose_prefix,len(set(table["audio_path"]))))
    table.sort_values(by="audio_path", axis=0, inplace=True, ignore_index=True) # sort by audio path, to facilitate non-redundant audio loading later on

    # determine output clip filenames & filepaths
    table["clip_filepath"] = build_clip_filepaths(table, args.output_dir, args.output_format, args.clip_filenames, args.clip_filename_delimiter)

    if args.verbose:
        print("%sExtracting %d clips and saving them in '%s' format.."%(args.verbose_prefix,table.shape[0],args.output_format))
    num_done = extract_clips(table, args)
    if args.verbose:
        print("%sDone. Extracted %d clips"%(args.verbose_prefix,num_done))


if __name__ == "__main__":
    main()