#!/usr/bin/env python3
#
# This python script extracts audio clips (e.g., from wav or flac files) based on a set of input annotation tables (Raven style).
# Annotation tables should be in CSV, TSV or Excel format (the format is automatically detected based on file extension).
# Example usage:
#	extract_audio_clips.py -i "my_annotations/*.txt" -f -v --relative_audio_filepaths -o extracted_clips --output_format "auto"
#
# Tested using Python 3.11, Mac OS 13.6.5.
#
# Stilianos Louca
# Copyright 2024
#
# LICENSE AGREEMENT
# - - - - - - - - -
# All rights reserved.
# Use and redistributions of this code is permitted for commercial and non-commercial purposes,
# under the following conditions:
#
#	* Redistributions must retain the above copyright notice, this list of 
#	  conditions and the following disclaimer in the code itself, as well 
#	  as in documentation and/or other materials provided with the code.
#	* Neither the name of the original author (Stilianos Louca), nor the names 
#	  of its contributors may be used to endorse or promote products derived 
#	  from this code without specific prior written permission.
#	* Proper attribution must be given to the original author, including a 
#     reference to any peer-reviewed publication through which the code was published.
#
# THIS CODE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS "AS IS" AND ANY 
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
# IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS CODE, 
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# - - - - - - - - -

import time
import argparse
import librosa	 # for loading non-wav files
import soundfile # for writing non-wav files
import scipy
import math
import glob
import os, sys
import pandas
import numpy
from collections import defaultdict

pandas.set_option('display.max_rows', None)
pandas.set_option('display.min_rows', None)
pandas.set_option('display.max_colwidth', None)

FILE_EXTENSION_TO_COLUMN_DELIMITER = {"txt":"\t", "tsv":"\t", "tab":"\t", "csv":",", "dnl":"\t"}

###########################################
# AUXILIARY FUNCTIONS

# get the right-most extension of a file
# if skip_gz==True, and the file name ends with ".gz", then the ".gz" part is ignored (for example "data.tsv.gz" yields "tsv").
def file_extension(file_path, skip_gz):
	"""
	Return the right-most extension of a file name, without the leading dot.

	Only the last path component is examined, so dots in directory names are never mistaken for an extension.

	Parameters:
		file_path: Path (or bare name) of the file.
		skip_gz: If true and the file name ends with ".gz" (case-insensitive), that suffix is stripped first, e.g. "data.tsv.gz" yields "tsv".

	Returns:
		The extension in its original letter case, or "" if the file name has no extension.
	"""
	file_name = os.path.basename(file_path)
	if(skip_gz and file_name.lower().endswith(".gz")): file_name = file_name[:-3]
	# rpartition() returns an empty separator if the name contains no dot, i.e., if there is no extension at all
	_, dot, extension = file_name.rpartition(".")
	return (extension if dot else "")


# guess the column delimiter of a classical table file, based on the file extension
def infer_file_delimiter(file_path, default=" "):
	"""
	Look up the column delimiter associated with a table file's extension.

	The extension is determined case-insensitively and ignoring any trailing ".gz".
	If the extension is not listed in FILE_EXTENSION_TO_COLUMN_DELIMITER, `default` is returned instead.
	"""
	key = file_extension(file_path, skip_gz=True).lower()
	if(key in FILE_EXTENSION_TO_COLUMN_DELIMITER):
		return FILE_EXTENSION_TO_COLUMN_DELIMITER[key]
	return default


# convert an integer to a string, padded with as many zeros as needed for obtaining the same total width up until a specific maximum value
# for example int2zero_padded_str(3,78) yields "03", and int2zero_padded_str(3,1673) yields "0003".
def int2zero_padded_str(value,max_value):
	"""
	Format an integer as a zero-padded string, wide enough to hold max_value.

	Parameters:
		value: The integer to be formatted.
		max_value: The largest value that will ever be formatted; its number of decimal digits (ignoring sign) determines the padded width.

	Returns:
		The padded string, e.g. "03" for (3,78) and "0003" for (3,1673).
	"""
	# count the digits exactly via the string representation
	# (int(math.log10(x)) is subject to floating-point rounding and overestimates the width for large x such as 10**18-1)
	width = len(str(int(abs(max_value))))
	return "%0*d"%(width,value)


###########################################
# MAIN BODY

# parse command line arguments
# --input_tables and --output_dir are mandatory, all other options have defaults
parser = argparse.ArgumentParser(description="Extract audio clips based on annotation tables.")
parser.add_argument('-i','--input_tables', required=True, type=str, help="Paths or shell wildcards to input annotation tables, in CSV/TSV/Excel format. Multiple paths must be separated by a colon.")
parser.add_argument('-o','--output_dir', required=True, type=str, help="Path to output directory where all extracted clips should be saved to.")

parser.add_argument('-r','--relative_audio_filepaths', action='store_true', dest="relative_audio_filepaths", default=False, help="Audio file paths listed in an input annotation table should be sought relative to that table, i.e., solely based on the audio file's name, even if a full path is given. This may be useful if the whole collection of audio files & annotation tables was moved since the annotation tables were created.")

parser.add_argument('--output_format', default="auto", choices=["wav","flac","ogg","auto"], help="Output audio format. If 'auto', then output file formats are the same as the input audio format. (default: '%(default)s')")
parser.add_argument('--clip_filenames', default="audio_name_and_selection_name", choices=["audio_name_and_selection_name","enumerate"], help="How to determine the output file names for the extracted clips. 'enumerate' means clips are named numerically in the order in which they are encountered across all files; this erases any information on their origin. (default: '%(default)s')")
parser.add_argument('--clip_filename_delimiter', default=".", type=str, help="Delimiter to use for constructing the output clip filenames. If this is '/', then it effectively acts as a directory deepener. (default: '%(default)s')")

parser.add_argument('-f','--force', action='store_true', dest="force", default=False, help='Replace existing output files without asking.')
parser.add_argument('--verbose_prefix', default="  ", help="Line prefix to be used for standard output messages. This may be useful if the script is part of another pipeline. (default: '%(default)s')")
parser.add_argument('-v','--verbose', action='store_true', dest="verbose", default=False, help='Show lots of information.')
args = parser.parse_args()

def abort(message, exit_code=1):
	"""
	Print a message to standard output and terminate the script.

	Parameters:
		message: Text to print before exiting.
		exit_code: Exit status of the process (default 1).
	"""
	print(message, file=sys.stdout)
	raise SystemExit(exit_code)

# find input annotation tables
# The input specification is split at colons, and each part is expanded as a shell wildcard.
# Overlapping wildcards can match the same file more than once; such duplicates are dropped (keeping the first-seen order),
# since otherwise the same table would be loaded repeatedly and its clips would be extracted repeatedly.
table_paths = list(dict.fromkeys(path for path_spec in args.input_tables.split(":") for path in glob.glob(path_spec)))
if(args.verbose): print("%sNote: Found %d input annotation tables"%(args.verbose_prefix,len(table_paths)))
if(len(table_paths)==0): abort("%sNothing to be done"%(args.verbose_prefix), 0)

# read every annotation table and stack all of them into a single master table
# all columns are read as strings, except for the clip start & end times, which are read as floats
input_dtypes = defaultdict(lambda: str, {"Begin Time (s)":float, "End Time (s)":float})
tables = []
for table_path in table_paths:
	is_excel = table_path.lower().endswith(".xlsx")
	if(not is_excel):
		# plain-text table: the column delimiter is guessed from the file extension, falling back to tabs
		table = pandas.read_csv(table_path, delimiter=infer_file_delimiter(table_path, default="\t"), dtype=input_dtypes, na_filter=False)
	else:
		table = pandas.read_excel(table_path, dtype=input_dtypes, na_filter=False)
	table["table_path"] = table_path # remember which table each annotation came from
	tables.append(table)
table = pandas.concat(tables, axis=0, join='outer', ignore_index=True, copy=False)
del tables
if(args.verbose): print("%sNote: Merged annotation master table has %d rows x %d columns"%(args.verbose_prefix,table.shape[0],table.shape[1]))

# determine input-audio filepaths
# fail with a clear message if the column listing the audio files is absent, rather than with an obscure KeyError
if("Begin File" not in table.columns): abort("%sERROR: Input annotation tables are missing the required column 'Begin File'"%(args.verbose_prefix))
table["audio_path"] = table["Begin File"]
if(args.relative_audio_filepaths):
	# keep only the audio file's name, and look for it in the same directory as the annotation table that listed it
	table["audio_path"] = [os.path.join(os.path.dirname(table_path),os.path.basename(audio_path)) for audio_path,table_path in zip(table["audio_path"],table["table_path"])]
if(args.verbose): print("%sNote: Annotations cover %d unique audio input files"%(args.verbose_prefix,len(set(table["audio_path"]))))
# sort by audio path, so that all annotations of the same audio file are adjacent (this facilitates non-redundant audio loading later on)
# a stable sorting algorithm is used, so that annotations of the same audio file keep the order in which they were listed in the input tables
table.sort_values(by="audio_path", axis=0, inplace=True, ignore_index=True, kind="mergesort")

# determine output clip filenames & filepaths
# The extension of each clip file must match the format the clip is actually saved in.
# If the output format is 'auto', this is the (lower-cased) extension of the clip's source audio file, not the literal string "auto".
if(args.output_format=="auto"):
	clip_extensions = [os.path.splitext(audio_path)[1].lower()[1:] for audio_path in table["audio_path"]]
else:
	clip_extensions = [args.output_format]*table.shape[0]
if(args.clip_filenames=="audio_name_and_selection_name"):
	# clip name = <audio file name without extension><delimiter><selection name>.<extension>
	# the selection name is explicitly converted to a string, in case a table reader returned it as a number
	table["clip_filepath"] = [os.path.join(args.output_dir,os.path.splitext(os.path.basename(audio_path))[0]+args.clip_filename_delimiter+str(selection_name)+"."+extension) for audio_path,selection_name,extension in zip(table["audio_path"],table["Selection"],clip_extensions)]
elif(args.clip_filenames=="enumerate"):
	# clip name = <zero-padded running number, starting at 1>.<extension>
	table["clip_filepath"] = [os.path.join(args.output_dir,int2zero_padded_str(1+k,1+table.shape[0])+"."+extension) for k,extension in enumerate(clip_extensions)]

# extract all clips, one annotation (table row) at a time
if(args.verbose): print("%sExtracting %d clips (output format: %s).."%(args.verbose_prefix,table.shape[0],args.output_format))
Ndone = 0
previous_audio_path = None
for r,row in table.iterrows():
	# never overwrite existing files, unless explicitly permitted by the user
	if((not args.force) and os.path.exists(row["clip_filepath"])): abort("%s  ERROR: Output file '%s' already exists. Cowardly refusing to continue"%(args.verbose_prefix,row["clip_filepath"]))
	if(row["audio_path"].lower().endswith(".wav")):
		# load audio with scipy and cut out the clip that we want. Note that scipy is faster than librosa & soundfile, at least for small files
		# we only load the audio if the path differs from the previous one; otherwise the previously loaded audio is reused
		if((previous_audio_path is None) or (row["audio_path"]!=previous_audio_path)):
			sampling_rate, audio = scipy.io.wavfile.read(row["audio_path"])
		# convert the start & end times to sample indices; the slice is taken along the first axis and keeps any further axes intact
		clip = audio[int(row["Begin Time (s)"]*sampling_rate):int(row["End Time (s)"]*sampling_rate+1),...]
	else:
		# load non-wav audio using librosa, since scipy only supports wav
		# Note that librosa is much slower than scipy (at least for short audios), and automatically rescales the audio signal to fit within the range -1 to +1.
		# A benefit of librosa is that it allows specifying the start & end position to load, whereas scipy first needs to load the whole audio file
		# NOTE(review): librosa.load() is called with its default channel handling, which may differ from the wav branch above for multi-channel audio -- confirm this is intended
		clip, sampling_rate = librosa.load(row["audio_path"],offset=row["Begin Time (s)"],duration=row["End Time (s)"]-row["Begin Time (s)"], sr=None)
	previous_audio_path = row["audio_path"]
	# create the clip's parent directory if needed; skipped if the clip path has no directory part, since os.makedirs("") would fail
	clip_dir = os.path.dirname(row["clip_filepath"])
	if(clip_dir!=""): os.makedirs(clip_dir, exist_ok=True)
	# in 'auto' mode, the output format is the (lower-cased) extension of the input audio file
	output_format = (os.path.splitext(row["audio_path"])[1].lower()[1:] if (args.output_format=="auto") else args.output_format)
	if(output_format=="wav"):
		# for wav, we use scipy, which is a bit faster than soundfile
		scipy.io.wavfile.write(row["clip_filepath"], rate=sampling_rate, data=clip)
	else:
		# use soundfile to save the clip in non-wav format
		soundfile.write(file=row["clip_filepath"], data=clip, samplerate=sampling_rate, format=output_format)
	Ndone += 1
	# report progress every 100 clips
	if(args.verbose and ((Ndone % 100)==0)): print("%s  Note: Done extracting %d out of %d clips"%(args.verbose_prefix,Ndone,table.shape[0]))

if(args.verbose): print("%sDone. Extracted %d clips"%(args.verbose_prefix,Ndone))