Source code for sox.transform
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Python wrapper around the SoX library.
This module requires that SoX is installed.
'''
from __future__ import print_function
import os
import random
from pathlib import Path
from typing import List, Optional, Dict, Union, Tuple
import numpy as np
from typing_extensions import Literal
from . import file_info
from .core import ENCODING_VALS, EncodingValue
from .core import SoxError
from .core import VALID_FORMATS
from .core import is_number
from .core import play
from .core import sox
from .log import logger
# Verbosity levels accepted by Transformer.set_globals (emitted as -V<level>).
VERBOSITY_VALS = list(range(5))

# NumPy sample dtype -> SoX file-type string used when audio is exchanged
# with SoX as a raw array.
ENCODINGS_MAPPING = {
    np.int16: 's16',
    np.int8: 's8',
    np.float32: 'f32',
    np.float64: 'f64',
}

# Literal type alias restricting a value to one of these three strings.
GainType = Literal['amplitude', 'power', 'db']
[docs]class Transformer:
'''Audio file transformer.
Class which allows multiple effects to be chained to create an output
file, saved to output_filepath.
Methods
-------
set_globals
Overwrite the default global arguments.
build
Execute the current chain of commands to create an output file.
build_file
Alias of build.
build_array
Execute the current chain of commands to create an output array.
'''
def __init__(self):
    '''Create a transformer with an empty effects chain and default globals.

    Attributes
    ----------
    input_format : dict
        Input file format options. Empty until set_input_format is
        called; converted to SoX arguments when the command is built.
    output_format : dict
        Output file format options. Empty until set_output_format is
        called; converted to SoX arguments when the command is built.
    effects : list of str
        Effects arguments that will be passed to SoX.
    effects_log : list of str
        Ordered sequence of the names of the effects applied.
    globals : list of str
        Global arguments that will be passed to SoX.
    '''
    self.input_format: Dict = {}
    self.output_format: Dict = {}
    self.effects: List[str] = []
    self.effects_log: List[str] = []
    self.globals: List[str] = []
    # Populate self.globals with the defaults of set_globals.
    self.set_globals()
def set_globals(self, dither: bool = False,
                guard: bool = False,
                multithread: bool = False,
                replay_gain: bool = False,
                verbosity: int = 2):
    '''Set SoX's global arguments, replacing any set previously.

    When this method is never called explicitly, the globals correspond to
    its default argument values.

    Parameters
    ----------
    dither : bool, default=False
        If True, dithering is left enabled. If False, the ``-D`` flag is
        passed to SoX.
    guard : bool, default=False
        If True, invokes the gain effect to guard against clipping
        (``-G``).
    multithread : bool, default=False
        If True, each channel is processed in parallel
        (``--multi-threaded``).
    replay_gain : bool, default=False
        If True, applies replay-gain adjustment to input files
        (``--replay-gain track``).
    verbosity : int, default=2
        SoX's verbosity level. One of:
            * 0 : No messages are shown at all.
            * 1 : Only error messages are shown. These are generated if
                SoX cannot complete the requested commands.
            * 2 : Warning messages are also shown. These are generated if
                SoX can complete the requested commands, but not exactly
                according to the requested command parameters, or if
                clipping occurs.
            * 3 : Descriptions of SoX's processing phases are also shown.
            * 4 : Messages to help with debugging SoX are also shown.
        Values that are not in VERBOSITY_VALS are rejected.

    Returns
    -------
    self : Transformer
        The transformer itself, so that calls can be chained.

    Raises
    ------
    ValueError
        If any flag is not a boolean or verbosity is not a valid level.
    '''
    # Checked in signature order so the first offending argument is reported.
    switches = (
        ('dither', dither),
        ('guard', guard),
        ('multithread', multithread),
        ('replay_gain', replay_gain),
    )
    for label, value in switches:
        if not isinstance(value, bool):
            raise ValueError('{} must be a boolean.'.format(label))

    if verbosity not in VERBOSITY_VALS:
        raise ValueError(
            'Invalid value for VERBOSITY. Must be one {}'.format(
                VERBOSITY_VALS)
        )

    # -D is emitted when dithering was NOT requested.
    sox_globals = [] if dither else ['-D']
    if guard:
        sox_globals.append('-G')
    if multithread:
        sox_globals.append('--multi-threaded')
    if replay_gain:
        sox_globals.extend(['--replay-gain', 'track'])
    sox_globals.append('-V{}'.format(verbosity))

    self.globals = sox_globals
    return self
def _validate_input_format(self, input_format):
    '''Check an input format dictionary and raise on the first bad entry.

    Parameters
    ----------
    input_format : dict
        May contain the keys 'file_type', 'rate', 'bits', 'channels',
        'encoding' and 'ignore_length'. Missing keys are treated as None
        (False for 'ignore_length').

    Raises
    ------
    ValueError
        If any entry has an invalid type or value.
    '''
    file_type = input_format.get('file_type')
    if file_type is not None and file_type not in VALID_FORMATS:
        raise ValueError(
            'Invalid file_type. Must be one of {}'.format(VALID_FORMATS)
        )

    rate = input_format.get('rate')
    if rate is not None:
        if not is_number(rate):
            raise ValueError('rate must be a float or None')
        if rate <= 0:
            raise ValueError('rate must be a positive number')

    # bits and channels share the same rule: None or a positive int.
    for key in ('bits', 'channels'):
        count = input_format.get(key)
        if count is None:
            continue
        if not isinstance(count, int):
            raise ValueError('{} must be an int or None'.format(key))
        if count <= 0:
            raise ValueError('{} must be a positive number'.format(key))

    encoding = input_format.get('encoding')
    if encoding is not None and encoding not in ENCODING_VALS:
        raise ValueError(
            'Invalid encoding {}. Must be one of {}'.format(
                encoding, ENCODING_VALS)
        )

    if not isinstance(input_format.get('ignore_length', False), bool):
        raise ValueError('ignore_length must be a boolean')
def _input_format_args(self, input_format):
    '''Translate an input format dictionary into SoX command-line flags.

    The dictionary is validated first. Entries that are None (or missing)
    produce no flag.

    Parameters
    ----------
    input_format : dict
        Input format dictionary, as built by set_input_format.

    Returns
    -------
    input_format_args : list of str
        Flags in the order -t, -r, -b, -c, -e, --ignore-length.
    '''
    self._validate_input_format(input_format)

    # (SoX flag, dictionary key, format template), in emission order.
    flag_table = (
        ('-t', 'file_type', '{}'),
        ('-r', 'rate', '{:f}'),
        ('-b', 'bits', '{}'),
        ('-c', 'channels', '{}'),
        ('-e', 'encoding', '{}'),
    )

    cli_args = []
    for flag, key, template in flag_table:
        value = input_format.get(key)
        if value is not None:
            cli_args.extend([flag, template.format(value)])

    if input_format.get('ignore_length', False):
        cli_args.append('--ignore-length')

    return cli_args
def set_input_format(self,
                     file_type: Optional[str] = None,
                     rate: Optional[float] = None,
                     bits: Optional[int] = None,
                     channels: Optional[int] = None,
                     encoding: Optional[EncodingValue] = None,
                     ignore_length: bool = False):
    '''Sets input file format arguments. This is primarily useful when
    dealing with audio files without a file extension. Overwrites any
    previously set input file arguments.

    If this function is not explicitly called the input format is inferred
    from the file extension or the file's header.

    Parameters
    ----------
    file_type : str or None, default=None
        The file type of the input audio file. Should be the same as what
        the file extension would be, for ex. 'mp3' or 'wav'.
    rate : float or None, default=None
        The sample rate of the input audio file. If None the sample rate
        is inferred.
    bits : int or None, default=None
        The number of bits per sample. If None, the number of bits per
        sample is inferred.
    channels : int or None, default=None
        The number of channels in the audio file. If None the number of
        channels is inferred.
    encoding : str or None, default=None
        The audio encoding type. Sometimes needed with file-types that
        support more than one encoding type. One of:
            * signed-integer : PCM data stored as signed ('two's
                complement') integers. Commonly used with a 16 or 24-bit
                encoding size. A value of 0 represents minimum signal
                power.
            * unsigned-integer : PCM data stored as unsigned integers.
                Commonly used with an 8-bit encoding size. A value of 0
                represents maximum signal power.
            * floating-point : PCM data stored as IEEE 754 single
                precision (32-bit) or double precision (64-bit)
                floating-point ('real') numbers. A value of 0 represents
                minimum signal power.
            * a-law : International telephony standard for logarithmic
                encoding to 8 bits per sample. It has a precision
                equivalent to roughly 13-bit PCM and is sometimes encoded
                with reversed bit-ordering.
            * u-law : North American telephony standard for logarithmic
                encoding to 8 bits per sample. A.k.a. mu-law. It has a
                precision equivalent to roughly 14-bit PCM and is
                sometimes encoded with reversed bit-ordering.
            * oki-adpcm : OKI (a.k.a. VOX, Dialogic, or Intel) 4-bit
                ADPCM; it has a precision equivalent to roughly 12-bit
                PCM. ADPCM is a form of audio compression that has a good
                compromise between audio quality and encoding/decoding
                speed.
            * ima-adpcm : IMA (a.k.a. DVI) 4-bit ADPCM; it has a precision
                equivalent to roughly 13-bit PCM.
            * ms-adpcm : Microsoft 4-bit ADPCM; it has a precision
                equivalent to roughly 14-bit PCM.
            * gsm-full-rate : GSM is currently used for the vast majority
                of the world's digital wireless telephone calls. It
                utilises several audio formats with different bit-rates
                and associated speech quality. SoX has support for GSM's
                original 13kbps 'Full Rate' audio format. It is usually
                CPU-intensive to work with GSM audio.
    ignore_length : bool, default=False
        If True, overrides an (incorrect) audio length given in an audio
        file's header. If this option is given then SoX will keep reading
        audio until it reaches the end of the input file.

    Returns
    -------
    self : Transformer
        The transformer itself, so that calls can be chained.

    Raises
    ------
    ValueError
        If any argument fails validation; the stored format is then left
        unchanged.
    '''
    input_format = {
        'file_type': file_type,
        'rate': rate,
        'bits': bits,
        'channels': channels,
        'encoding': encoding,
        'ignore_length': ignore_length
    }
    # Validate before storing so a bad call never clobbers a good format.
    self._validate_input_format(input_format)
    self.input_format = input_format
    # Return self for chaining, consistent with set_globals and the effects.
    return self
def _validate_output_format(self, output_format):
    '''Check an output format dictionary and raise on the first bad entry.

    Parameters
    ----------
    output_format : dict
        May contain the keys 'file_type', 'rate', 'bits', 'channels',
        'encoding', 'comments' and 'append_comments'. Missing keys are
        treated as None (True for 'append_comments').

    Raises
    ------
    ValueError
        If any entry has an invalid type or value.
    '''
    file_type = output_format.get('file_type')
    if file_type is not None and file_type not in VALID_FORMATS:
        raise ValueError(
            'Invalid file_type. Must be one of {}'.format(VALID_FORMATS)
        )

    rate = output_format.get('rate')
    if rate is not None:
        if not is_number(rate):
            raise ValueError('rate must be a float or None')
        if rate <= 0:
            raise ValueError('rate must be a positive number')

    # bits and channels share the same rule: None or a positive int.
    for key in ('bits', 'channels'):
        count = output_format.get(key)
        if count is None:
            continue
        if not isinstance(count, int):
            raise ValueError('{} must be an int or None'.format(key))
        if count <= 0:
            raise ValueError('{} must be a positive number'.format(key))

    encoding = output_format.get('encoding')
    if encoding is not None and encoding not in ENCODING_VALS:
        raise ValueError(
            'Invalid encoding. Must be one of {}'.format(ENCODING_VALS)
        )

    comments = output_format.get('comments')
    if not (comments is None or isinstance(comments, str)):
        raise ValueError('comments must be a string or None')

    if not isinstance(output_format.get('append_comments', True), bool):
        raise ValueError('append_comments must be a boolean')
def _output_format_args(self, output_format):
    '''Translate an output format dictionary into SoX command-line flags.

    The dictionary is validated first. Entries that are None (or missing)
    produce no flag.

    Parameters
    ----------
    output_format : dict
        Output format dictionary, as built by set_output_format.

    Returns
    -------
    output_format_args : list of str
        Flags in the order -t, -r, -b, -c, -e, followed by the comment
        option when a comment is given.
    '''
    self._validate_output_format(output_format)

    # (SoX flag, dictionary key, format template), in emission order.
    flag_table = (
        ('-t', 'file_type', '{}'),
        ('-r', 'rate', '{:f}'),
        ('-b', 'bits', '{}'),
        ('-c', 'channels', '{}'),
        ('-e', 'encoding', '{}'),
    )

    cli_args = []
    for flag, key, template in flag_table:
        value = output_format.get(key)
        if value is not None:
            cli_args.extend([flag, template.format(value)])

    comments = output_format.get('comments')
    if comments is not None:
        # --add-comment appends to the existing comments; --comment replaces.
        if output_format.get('append_comments', True):
            comment_flag = '--add-comment'
        else:
            comment_flag = '--comment'
        cli_args.extend([comment_flag, comments])

    return cli_args
def set_output_format(self,
                      file_type: Optional[str] = None,
                      rate: Optional[float] = None,
                      bits: Optional[int] = None,
                      channels: Optional[int] = None,
                      encoding: Optional[EncodingValue] = None,
                      comments: Optional[str] = None,
                      append_comments: bool = True):
    '''Sets output file format arguments. These arguments will overwrite
    any format related arguments supplied by other effects (e.g. rate).

    If this function is not explicitly called the output format is
    inferred from the file extension or the file's header.

    Parameters
    ----------
    file_type : str or None, default=None
        The file type of the output audio file. Should be the same as what
        the file extension would be, for ex. 'mp3' or 'wav'.
    rate : float or None, default=None
        The sample rate of the output audio file. If None the sample rate
        is inferred.
    bits : int or None, default=None
        The number of bits per sample. If None, the number of bits per
        sample is inferred.
    channels : int or None, default=None
        The number of channels in the audio file. If None the number of
        channels is inferred.
    encoding : str or None, default=None
        The audio encoding type. Sometimes needed with file-types that
        support more than one encoding type. One of:
            * signed-integer : PCM data stored as signed ('two's
                complement') integers. Commonly used with a 16 or 24-bit
                encoding size. A value of 0 represents minimum signal
                power.
            * unsigned-integer : PCM data stored as unsigned integers.
                Commonly used with an 8-bit encoding size. A value of 0
                represents maximum signal power.
            * floating-point : PCM data stored as IEEE 754 single
                precision (32-bit) or double precision (64-bit)
                floating-point ('real') numbers. A value of 0 represents
                minimum signal power.
            * a-law : International telephony standard for logarithmic
                encoding to 8 bits per sample. It has a precision
                equivalent to roughly 13-bit PCM and is sometimes encoded
                with reversed bit-ordering.
            * u-law : North American telephony standard for logarithmic
                encoding to 8 bits per sample. A.k.a. mu-law. It has a
                precision equivalent to roughly 14-bit PCM and is
                sometimes encoded with reversed bit-ordering.
            * oki-adpcm : OKI (a.k.a. VOX, Dialogic, or Intel) 4-bit
                ADPCM; it has a precision equivalent to roughly 12-bit
                PCM. ADPCM is a form of audio compression that has a good
                compromise between audio quality and encoding/decoding
                speed.
            * ima-adpcm : IMA (a.k.a. DVI) 4-bit ADPCM; it has a precision
                equivalent to roughly 13-bit PCM.
            * ms-adpcm : Microsoft 4-bit ADPCM; it has a precision
                equivalent to roughly 14-bit PCM.
            * gsm-full-rate : GSM is currently used for the vast majority
                of the world's digital wireless telephone calls. It
                utilises several audio formats with different bit-rates
                and associated speech quality. SoX has support for GSM's
                original 13kbps 'Full Rate' audio format. It is usually
                CPU-intensive to work with GSM audio.
    comments : str or None, default=None
        If not None, the string is added as a comment in the header of the
        output audio file. If None, no comments are added.
    append_comments : bool, default=True
        If True, comment strings are appended to SoX's default comments.
        If False, the supplied comment replaces the existing comment.

    Returns
    -------
    self : Transformer
        The transformer itself, so that calls can be chained.

    Raises
    ------
    ValueError
        If any argument fails validation; the stored format is then left
        unchanged.
    '''
    output_format = {
        'file_type': file_type,
        'rate': rate,
        'bits': bits,
        'channels': channels,
        'encoding': encoding,
        'comments': comments,
        'append_comments': append_comments
    }
    # Validate before storing so a bad call never clobbers a good format.
    self._validate_output_format(output_format)
    self.output_format = output_format
    # Return self for chaining, consistent with set_globals and the effects.
    return self
def clear_effects(self):
    '''Discard every queued effect and the matching effects log.

    Returns
    -------
    self : Transformer
        The transformer itself, so that calls can be chained.
    '''
    # Fresh lists are assigned (rather than cleared in place) so that any
    # external reference to the old lists is left untouched.
    self.effects, self.effects_log = [], []
    return self
def _parse_inputs(self, input_filepath, input_array, sample_rate_in):
    '''Private helper function for parsing inputs to build and build_array

    Exactly one of input_filepath and input_array must be given.

    Parameters
    ----------
    input_filepath : str or None
        Either path to input audio file or None.
    input_array : np.ndarray or None
        A np.ndarray of an waveform with shape (n_samples, n_channels)
        or None
    sample_rate_in : int or None
        Sample rate of input_array or None

    Returns
    -------
    input_format : dict
        Input format dictionary. For file input this is a copy of
        self.input_format with 'channels' filled in from the file when it
        was not set; for array input it is derived from the array.
    input_filepath : str
        Formatted input filepath ('-' for array input).

    Raises
    ------
    ValueError
        If both or neither of input_filepath and input_array are given,
        or if sample_rate_in is missing for array input.
    TypeError
        If input_array is not a numpy array.
    KeyError
        If the dtype of input_array is not in ENCODINGS_MAPPING.
    '''
    if input_filepath is not None and input_array is not None:
        raise ValueError(
            "Only one of input_filepath and input_array may be specified"
        )

    # set input parameters
    if input_filepath is not None:
        file_info.validate_input_file(input_filepath)
        # Work on a copy: the channel count inferred below belongs to this
        # particular file and must not be written back into
        # self.input_format, where it would leak into later builds that
        # use a file with a different number of channels.
        input_format = dict(self.input_format)
        if input_format.get('channels') is None:
            input_format['channels'] = file_info.channels(input_filepath)
    elif input_array is not None:
        if not isinstance(input_array, np.ndarray):
            raise TypeError("input_array must be a numpy array or None")
        if sample_rate_in is None:
            raise ValueError(
                "sample_rate_in must be specified for array inputs"
            )
        input_filepath = '-'
        input_format = {
            'file_type': ENCODINGS_MAPPING[input_array.dtype.type],
            'rate': sample_rate_in,
            'bits': None,
            # The last axis holds the channels; 1-D arrays are mono.
            'channels': (
                input_array.shape[-1] if input_array.ndim > 1 else 1
            ),
            'encoding': None,
            'ignore_length': False
        }
    else:
        raise ValueError(
            "One of input_filepath or input_array must be specified"
        )
    return input_format, input_filepath
def build(self,
          input_filepath: Optional[Union[str, Path]] = None,
          output_filepath: Optional[Union[str, Path]] = None,
          input_array: Optional[np.ndarray] = None,
          sample_rate_in: Optional[float] = None,
          extra_args: Optional[List[str]] = None,
          return_output: bool = False):
    '''Given an input file or array, creates an output_file on disk by
    executing the current set of commands. This function returns True on
    success. If return_output is True, this function returns a triple of
    (status, out, err), giving the success state, along with stdout and
    stderr returned by sox.

    Parameters
    ----------
    input_filepath : str, Path or None
        Either path to input audio file or None for array input.
    output_filepath : str or Path
        Path to desired output file. If a file already exists at
        the given path, the file will be overwritten.
        If '-n', no file is created.
        Must be specified; None raises ValueError.
    input_array : np.ndarray or None
        An np.ndarray of an waveform with shape (n_samples, n_channels).
        sample_rate_in must also be provided.
        If None, input_filepath must be specified.
    sample_rate_in : int
        Sample rate of input_array.
        This argument is ignored if input_array is None.
    extra_args : list or None, default=None
        If a list is given, these additional arguments are passed to SoX
        at the end of the list of effects.
        Don't use this argument unless you know exactly what you're doing!
    return_output : bool, default=False
        If True, returns the status and information sent to stderr and
        stdout as a tuple (status, stdout, stderr).
        If False, returns True on success.

    Returns
    -------
    status : bool or int
        True on success when return_output is False. When return_output
        is True this is the status reported by sox, which is 0 here
        because any other status raises SoxError.
    out : str (optional)
        This is not returned unless return_output is True.
        When returned, captures the stdout produced by sox.
    err : str (optional)
        This is not returned unless return_output is True.
        When returned, captures the stderr produced by sox.

    Raises
    ------
    ValueError
        If output_filepath is None, if the input and output paths are the
        same, or if extra_args is not a list.
    SoxError
        If sox reports a non-zero status.

    Examples
    --------
    >>> import numpy as np
    >>> import sox
    >>> tfm = sox.Transformer()
    >>> sample_rate = 44100
    >>> y = np.sin(2 * np.pi * 440.0 * np.arange(sample_rate * 1.0) / sample_rate)

    file in, file out - basic usage

    >>> status = tfm.build('path/to/input.wav', 'path/to/output.mp3')

    file in, file out - equivalent usage

    >>> status = tfm.build(
            input_filepath='path/to/input.wav',
            output_filepath='path/to/output.mp3'
        )

    array in, file out

    >>> status = tfm.build(
            input_array=y, sample_rate_in=sample_rate,
            output_filepath='path/to/output.mp3'
        )
    '''
    input_format, input_filepath = self._parse_inputs(
        input_filepath, input_array, sample_rate_in
    )

    if output_filepath is None:
        raise ValueError("output_filepath is not specified!")

    # set output parameters
    # Compare as strings so that a pathlib.Path and a str naming the same
    # path are recognised as identical.
    if str(input_filepath) == str(output_filepath):
        raise ValueError(
            "input_filepath must be different from output_filepath."
        )
    file_info.validate_output_file(output_filepath)

    # Argument order: globals, input format + input, output format +
    # output, then the effects chain.
    args = []
    args.extend(self.globals)
    args.extend(self._input_format_args(input_format))
    args.append(input_filepath)
    args.extend(self._output_format_args(self.output_format))
    args.append(output_filepath)
    args.extend(self.effects)

    if extra_args is not None:
        if not isinstance(extra_args, list):
            raise ValueError("extra_args must be a list.")
        args.extend(extra_args)

    status, out, err = sox(args, input_array, True)
    if status != 0:
        raise SoxError(
            "Stdout: {}\nStderr: {}".format(out, err)
        )

    logger.info(
        "Created %s with effects: %s",
        output_filepath,
        " ".join(self.effects_log)
    )

    if return_output:
        return status, out, err

    return True
def build_file(self,
               input_filepath: Optional[Union[str, Path]] = None,
               output_filepath: Optional[Union[str, Path]] = None,
               input_array: Optional[np.ndarray] = None,
               sample_rate_in: Optional[float] = None,
               extra_args: Optional[List[str]] = None,
               return_output: bool = False):
    '''An alias for build.

    Forwards every argument unchanged to build and returns its result:
    given an input file or array, an output file is created on disk by
    executing the current set of commands.

    Parameters
    ----------
    input_filepath : str or None
        Either path to input audio file or None for array input.
    output_filepath : str
        Path to desired output file. If a file already exists at
        the given path, the file will be overwritten.
        If '-n', no file is created.
    input_array : np.ndarray or None
        An np.ndarray of an waveform with shape (n_samples, n_channels).
        sample_rate_in must also be provided.
        If None, input_filepath must be specified.
    sample_rate_in : int
        Sample rate of input_array.
        This argument is ignored if input_array is None.
    extra_args : list or None, default=None
        If a list is given, these additional arguments are passed to SoX
        at the end of the list of effects.
        Don't use this argument unless you know exactly what you're doing!
    return_output : bool, default=False
        If True, returns the status and information sent to stderr and
        stdout as a tuple (status, stdout, stderr).
        If False, returns True on success.

    Returns
    -------
    status : bool
        True on success.
    out : str (optional)
        This is not returned unless return_output is True.
        When returned, captures the stdout produced by sox.
    err : str (optional)
        This is not returned unless return_output is True.
        When returned, captures the stderr produced by sox.

    Examples
    --------
    >>> import numpy as np
    >>> import sox
    >>> tfm = sox.Transformer()
    >>> sample_rate = 44100
    >>> y = np.sin(2 * np.pi * 440.0 * np.arange(sample_rate * 1.0) / sample_rate)

    file in, file out - basic usage

    >>> status = tfm.build_file('path/to/input.wav', 'path/to/output.mp3')

    array in, file out

    >>> status = tfm.build_file(
            input_array=y, sample_rate_in=sample_rate,
            output_filepath='path/to/output.mp3'
        )
    '''
    # Positional forwarding, in build's own parameter order.
    forwarded = (
        input_filepath, output_filepath, input_array, sample_rate_in,
        extra_args, return_output,
    )
    return self.build(*forwarded)
def build_array(self,
                input_filepath: Optional[Union[str, Path]] = None,
                input_array: Optional[np.ndarray] = None,
                sample_rate_in: Optional[float] = None,
                extra_args: Optional[List[str]] = None):
    '''Given an input file or array, returns the output as a numpy array
    by executing the current set of commands. By default the array will
    have the same sample rate as the input file unless otherwise specified
    using set_output_format. Functions such as rate, channels and convert
    will be ignored!

    Parameters
    ----------
    input_filepath : str, Path or None
        Either path to input audio file or None.
    input_array : np.ndarray or None
        A np.ndarray of an waveform with shape (n_samples, n_channels).
        If this argument is passed, sample_rate_in must also be provided.
        If None, input_filepath must be specified.
    sample_rate_in : int
        Sample rate of input_array.
        This argument is ignored if input_array is None.
    extra_args : list or None, default=None
        If a list is given, these additional arguments are passed to SoX
        at the end of the list of effects.
        Don't use this argument unless you know exactly what you're doing!

    Returns
    -------
    output_array : np.ndarray
        Output audio as a numpy array. Multi-channel output has shape
        (n_samples, n_channels); single-channel output is 1-D.

    Raises
    ------
    ValueError
        If the output bits set via set_output_format are not one of
        8, 16, 32 or 64, or if extra_args is not a list.
    SoxError
        If sox reports a non-zero status.

    Examples
    --------
    >>> import numpy as np
    >>> import sox
    >>> tfm = sox.Transformer()
    >>> sample_rate = 44100
    >>> y = np.sin(2 * np.pi * 440.0 * np.arange(sample_rate * 1.0) / sample_rate)

    file in, array out

    >>> output_array = tfm.build_array(input_filepath='path/to/input.wav')

    array in, array out

    >>> output_array = tfm.build_array(
            input_array=y, sample_rate_in=sample_rate
        )

    specifying the output sample rate

    >>> tfm.set_output_format(rate=8000)
    >>> output_array = tfm.build_array(
            input_array=y, sample_rate_in=sample_rate
        )

    if an effect changes the number of channels, you must explicitly
    specify the number of output channels

    >>> tfm.remix(remix_dictionary={1: [1], 2: [1], 3: [1]})
    >>> tfm.set_output_format(channels=3)
    >>> output_array = tfm.build_array(
            input_array=y, sample_rate_in=sample_rate
        )
    '''
    input_format, input_filepath = self._parse_inputs(
        input_filepath, input_array, sample_rate_in
    )

    # check if any of the below commands are part of the effects chain
    ignored_commands = ['rate', 'channels', 'convert']
    if set(ignored_commands) & set(self.effects_log):
        logger.warning(
            "When outputting to an array, rate, channels and convert " +
            "effects may be ignored. Use set_output_format() to " +
            "specify output formats."
        )

    output_filepath = '-'

    # Pick the output dtype that corresponds to the input file type. A
    # file input may carry no file type, or one (e.g. 'wav') that has no
    # raw dtype equivalent; both cases fall back to 16-bit integers
    # instead of failing on an empty lookup.
    input_file_type = input_format.get('file_type')
    encoding_out = next(
        (
            dtype for dtype, sox_type in ENCODINGS_MAPPING.items()
            if sox_type == input_file_type
        ),
        np.int16
    )

    n_bits = np.dtype(encoding_out).itemsize * 8

    output_format = {
        'file_type': 'raw',
        'rate': sample_rate_in,
        'bits': n_bits,
        'channels': input_format['channels'],
        'encoding': None,
        'comments': None,
        'append_comments': True,
    }

    # Explicit output settings take precedence over the input-derived ones.
    if self.output_format.get('rate') is not None:
        output_format['rate'] = self.output_format['rate']

    if self.output_format.get('channels') is not None:
        output_format['channels'] = self.output_format['channels']

    if self.output_format.get('bits') is not None:
        n_bits = self.output_format['bits']
        output_format['bits'] = n_bits

    # The dtype used to decode SoX's raw output follows the final bit depth.
    if n_bits == 8:
        encoding_out = np.int8
    elif n_bits == 16:
        encoding_out = np.int16
    elif n_bits == 32:
        encoding_out = np.float32
    elif n_bits == 64:
        encoding_out = np.float64
    else:
        raise ValueError("invalid n_bits {}".format(n_bits))

    args = []
    args.extend(self.globals)
    args.extend(self._input_format_args(input_format))
    args.append(input_filepath)
    args.extend(self._output_format_args(output_format))
    args.append(output_filepath)
    args.extend(self.effects)

    if extra_args is not None:
        if not isinstance(extra_args, list):
            raise ValueError("extra_args must be a list.")
        args.extend(extra_args)

    status, out, err = sox(args, input_array, False)
    if status != 0:
        raise SoxError(
            "Stdout: {}\nStderr: {}".format(out, err)
        )

    out = np.frombuffer(out, dtype=encoding_out)
    n_channels = output_format['channels']
    if n_channels > 1:
        # Split the flat buffer into (n_samples, n_channels).
        out = out.reshape(
            (n_channels, len(out) // n_channels), order='F'
        ).T
    logger.info(
        "Created array with effects: %s",
        " ".join(self.effects_log)
    )

    return out
def preview(self, input_filepath: Union[str, Path]):
    '''Play a preview of the output with the current set of effects

    Parameters
    ----------
    input_filepath : str or Path
        Path to input audio file.

    Raises
    ------
    ValueError
        If the stored input format fails validation.
    '''
    args = ["play", "--no-show-progress"]
    args.extend(self.globals)
    # self.input_format is a dict; extending the argument list with it
    # directly would add its keys ('file_type', 'rate', ...) as bogus
    # arguments, so convert it to command-line flags first.
    args.extend(self._input_format_args(self.input_format))
    args.append(input_filepath)
    args.extend(self.effects)

    play(args)
def allpass(self, frequency: float, width_q: float = 2.0):
    '''Apply a two-pole all-pass filter.

    An all-pass filter changes the audio's frequency to phase relationship
    without changing its frequency to amplitude relationship. The filter
    is described in detail at
    http://musicdsp.org/files/Audio-EQ-Cookbook.txt

    Parameters
    ----------
    frequency : float
        The filter's center frequency in Hz.
    width_q : float, default=2.0
        The filter's width as a Q-factor.

    Returns
    -------
    self : Transformer
        The transformer itself, so that calls can be chained.

    Raises
    ------
    ValueError
        If frequency or width_q is not a positive number.

    See Also
    --------
    equalizer, highpass, lowpass, sinc
    '''
    # Both parameters obey the same rule; frequency is checked first.
    for label, value in (('frequency', frequency), ('width_q', width_q)):
        if not is_number(value) or value <= 0:
            raise ValueError("{} must be a positive number.".format(label))

    center = '{:f}'.format(frequency)
    q_width = '{:f}q'.format(width_q)
    self.effects.extend(['allpass', center, q_width])
    self.effects_log.append('allpass')
    return self
def bandpass(self, frequency: float, width_q: float = 2.0,
             constant_skirt: bool = False):
    '''Apply a two-pole Butterworth band-pass filter with the given central
    frequency, and (3dB-point) band-width.

    The filter rolls off at 6dB per octave (20dB per decade) and is
    described in detail in
    http://musicdsp.org/files/Audio-EQ-Cookbook.txt

    Parameters
    ----------
    frequency : float
        The filter's center frequency in Hz.
    width_q : float, default=2.0
        The filter's width as a Q-factor.
    constant_skirt : bool, default=False
        If True, selects constant skirt gain (peak gain = width_q).
        If False, selects constant 0dB peak gain.

    Returns
    -------
    self : Transformer
        The transformer itself, so that calls can be chained.

    Raises
    ------
    ValueError
        If frequency or width_q is not a positive number, or
        constant_skirt is not a boolean.

    See Also
    --------
    bandreject, sinc
    '''
    if not (is_number(frequency) and frequency > 0):
        raise ValueError("frequency must be a positive number.")
    if not (is_number(width_q) and width_q > 0):
        raise ValueError("width_q must be a positive number.")
    if not isinstance(constant_skirt, bool):
        raise ValueError("constant_skirt must be a boolean.")

    # The optional -c switch goes between the effect name and its values.
    skirt_flag = ['-c'] if constant_skirt else []
    filter_values = ['{:f}'.format(frequency), '{:f}q'.format(width_q)]

    self.effects.extend(['bandpass'] + skirt_flag + filter_values)
    self.effects_log.append('bandpass')
    return self
def bandreject(self, frequency: float, width_q: float = 2.0,
               constant_skirt: bool = False):
    '''Apply a two-pole Butterworth band-reject filter with the given
    central frequency, and (3dB-point) band-width.

    The filter rolls off at 6dB per octave (20dB per decade) and is
    described in detail in
    http://musicdsp.org/files/Audio-EQ-Cookbook.txt

    Parameters
    ----------
    frequency : float
        The filter's center frequency in Hz.
    width_q : float, default=2.0
        The filter's width as a Q-factor.
    constant_skirt : bool, default=False
        If True, selects constant skirt gain (peak gain = width_q).
        If False, selects constant 0dB peak gain.

    Returns
    -------
    self : Transformer
        The transformer itself, so that calls can be chained.

    Raises
    ------
    ValueError
        If frequency or width_q is not a positive number, or
        constant_skirt is not a boolean.

    See Also
    --------
    bandpass, sinc
    '''
    if not (is_number(frequency) and frequency > 0):
        raise ValueError("frequency must be a positive number.")
    if not (is_number(width_q) and width_q > 0):
        raise ValueError("width_q must be a positive number.")
    if not isinstance(constant_skirt, bool):
        raise ValueError("constant_skirt must be a boolean.")

    # Assemble: effect name, optional -c switch, then frequency and width.
    command = ['bandreject']
    if constant_skirt:
        command += ['-c']
    command += ['{:f}'.format(frequency), '{:f}q'.format(width_q)]

    self.effects.extend(command)
    self.effects_log.append('bandreject')
    return self
def bass(self, gain_db: float,
         frequency: float = 100.0,
         slope: float = 0.5):
    '''Boost or cut the bass (lower) frequencies of the audio using a
    two-pole shelving filter with a response similar to that of a standard
    hi-fi's tone-controls. This is also known as shelving equalisation.

    The filters are described in detail in
    http://musicdsp.org/files/Audio-EQ-Cookbook.txt

    Parameters
    ----------
    gain_db : float
        The gain at 0 Hz.
        For a large cut use -20, for a large boost use 20.
    frequency : float, default=100.0
        The filter's cutoff frequency in Hz.
    slope : float, default=0.5
        The steepness of the filter's shelf transition, in the range
        (0, 1.0]. For a gentle slope use 0.3, and use 1.0 for a steep
        slope.

    Returns
    -------
    self : Transformer
        The transformer itself, so that calls can be chained.

    Raises
    ------
    ValueError
        If gain_db is not a number, frequency is not a positive number,
        or slope is not a number in (0, 1.0].

    See Also
    --------
    treble, equalizer
    '''
    if not is_number(gain_db):
        raise ValueError("gain_db must be a number")

    if not is_number(frequency) or frequency <= 0:
        raise ValueError("frequency must be a positive number.")

    if not is_number(slope) or slope <= 0 or slope > 1.0:
        # Name the argument that actually failed (the message previously
        # blamed width_q, which this method does not take).
        raise ValueError(
            "slope must be a positive number less than or equal to 1.0."
        )

    effect_args = [
        'bass', '{:f}'.format(gain_db), '{:f}'.format(frequency),
        '{:f}s'.format(slope)
    ]

    self.effects.extend(effect_args)
    self.effects_log.append('bass')

    return self
def bend(self,
         n_bends: int,
         start_times: List[float],
         end_times: List[float],
         cents: List[float],
         frame_rate: int = 25,
         oversample_rate: int = 16):
    '''Shift the pitch by the given amounts over the given time intervals.

    The pitch-bending algorithm utilises the Discrete Fourier Transform
    (DFT) at a particular frame rate and over-sampling rate.

    Parameters
    ----------
    n_bends : int
        The number of intervals to pitch shift.
    start_times : list of floats
        Absolute start times (in seconds) of each interval, in
        increasing order. Must have length n_bends.
    end_times : list of floats
        Absolute end times (in seconds) of each interval, in increasing
        order. Must have length n_bends. Each end time must be greater
        than the matching start time, and intervals may not overlap.
    cents : list of floats
        Pitch shift of each interval in cents. A positive value shifts
        the pitch up, a negative value shifts the pitch down.
    frame_rate : int, default=25
        The number of DFT frames to process per second, between 10 and 80.
    oversample_rate : int, default=16
        The number of frames to over sample per second, between 4 and 32.

    See Also
    --------
    pitch

    '''
    if not (isinstance(n_bends, int) and n_bends >= 1):
        raise ValueError("n_bends must be a positive integer.")

    # --- start_times: correct length, positive, ascending ---
    if not isinstance(start_times, list) or len(start_times) != n_bends:
        raise ValueError("start_times must be a list of length n_bends.")
    invalid_starts = [(not is_number(t) or t <= 0) for t in start_times]
    if any(invalid_starts):
        raise ValueError("start_times must be positive floats.")
    if sorted(start_times) != start_times:
        raise ValueError("start_times must be in increasing order.")

    # --- end_times: correct length, positive, ascending ---
    if not isinstance(end_times, list) or len(end_times) != n_bends:
        raise ValueError("end_times must be a list of length n_bends.")
    invalid_ends = [(not is_number(t) or t <= 0) for t in end_times]
    if any(invalid_ends):
        raise ValueError("end_times must be positive floats.")
    if sorted(end_times) != end_times:
        raise ValueError("end_times must be in increasing order.")

    # --- each interval must have positive length ---
    empty_intervals = [
        finish <= begin for begin, finish in zip(start_times, end_times)
    ]
    if any(empty_intervals):
        raise ValueError(
            "end_times must be element-wise greater than start_times."
        )

    # --- an interval may not end after the next one starts ---
    later_starts = start_times[1:]
    earlier_ends = end_times[:-1]
    overlapping = [
        prev_end > next_start
        for next_start, prev_end in zip(later_starts, earlier_ends)
    ]
    if any(overlapping):
        raise ValueError(
            "[start_time, end_time] intervals must be non-overlapping."
        )

    # --- cents ---
    if not isinstance(cents, list) or len(cents) != n_bends:
        raise ValueError("cents must be a list of length n_bends.")
    non_numeric_cents = [not is_number(c) for c in cents]
    if any(non_numeric_cents):
        raise ValueError("elements of cents must be floats.")

    # --- DFT settings ---
    if not (isinstance(frame_rate, int) and 10 <= frame_rate <= 80):
        raise ValueError("frame_rate must be an integer between 10 and 80")
    if not (isinstance(oversample_rate, int)
            and 4 <= oversample_rate <= 32):
        raise ValueError(
            "oversample_rate must be an integer between 4 and 32."
        )

    effect_args = [
        'bend',
        '-f', '{}'.format(frame_rate),
        '-o', '{}'.format(oversample_rate)
    ]

    # Each bend is emitted as "offset,cents,duration", where the offset is
    # measured from the end of the previous bend (0 for the first one).
    previous_end = 0
    for begin, finish, shift in zip(start_times, end_times, cents):
        offset = round(begin - previous_end, 2)
        duration = round(finish - begin, 2)
        effect_args.append(
            '{:f},{:f},{:f}'.format(offset, shift, duration)
        )
        previous_end = finish

    self.effects.extend(effect_args)
    self.effects_log.append('bend')
    return self
def biquad(self, b: List[float], a: List[float]):
    '''Apply a biquad IIR filter defined by its coefficients.

    Parameters
    ----------
    b : list of floats
        Numerator coefficients. Must be length 3.
    a : list of floats
        Denominator coefficients. Must be length 3.

    See Also
    --------
    fir, treble, bass, equalizer

    '''
    # container type first, for both coefficient sets
    if not isinstance(b, list):
        raise ValueError('b must be a list.')
    if not isinstance(a, list):
        raise ValueError('a must be a list.')

    # then length
    if len(b) != 3:
        raise ValueError('b must be a length 3 list.')
    if len(a) != 3:
        raise ValueError('a must be a length 3 list.')

    # then element type
    b_numeric = [is_number(coeff) for coeff in b]
    if not all(b_numeric):
        raise ValueError('all elements of b must be numbers.')
    a_numeric = [is_number(coeff) for coeff in a]
    if not all(a_numeric):
        raise ValueError('all elements of a must be numbers.')

    # SoX expects the three numerator values followed by the three
    # denominator values.
    effect_args = ['biquad']
    effect_args.extend('{:f}'.format(coeff) for coeff in b + a)

    self.effects.extend(effect_args)
    self.effects_log.append('biquad')
    return self
def channels(self, n_channels: int):
    '''Set the number of channels of the audio signal.

    When the channel count is reduced the channels are mixed together;
    when it is increased, channels are duplicated.

    Note: This overrides arguments used in the convert effect!

    Parameters
    ----------
    n_channels : int
        Desired number of channels. Must be a positive integer.

    See Also
    --------
    convert

    '''
    is_valid = isinstance(n_channels, int) and n_channels > 0
    if not is_valid:
        raise ValueError('n_channels must be a positive integer.')

    self.effects.extend(['channels', '{}'.format(n_channels)])
    self.effects_log.append('channels')
    return self
def chorus(self,
           gain_in: float = 0.5, gain_out: float = 0.9,
           n_voices: int = 3,
           delays: Optional[List[float]] = None,
           decays: Optional[List[float]] = None,
           speeds: Optional[List[float]] = None,
           depths: Optional[List[float]] = None,
           shapes: Optional[List[Literal['s', 't']]] = None):
    '''Add a chorus effect to the audio. This can make a single vocal sound
    like a chorus, but can also be applied to instrumentation.

    Chorus resembles an echo effect with a short delay, but whereas with
    echo the delay is constant, with chorus, it is varied using sinusoidal
    or triangular modulation. The modulation depth defines the range the
    modulated delay is played before or after the delay. Hence the delayed
    sound will sound slower or faster, that is the delayed sound tuned
    around the original one, like in a chorus where some vocals are
    slightly off key.

    Parameters
    ----------
    gain_in : float, default=0.5
        Input volume (SoX gain-in). Must be greater than 0 and at most 1.
    gain_out : float, default=0.9
        Output volume (SoX gain-out). Must be greater than 0 and at
        most 1.
    n_voices : int, default=3
        The number of voices in the chorus effect.
    delays : list of floats >= 20 or None, default=None
        If a list, the list of delays (in milliseconds) of length
        n_voices.
        If None, each delay is drawn at random between 40 and 60
        milliseconds.
    decays : list of floats or None, default=None
        If a list, the list of decays (as a fraction of gain_in) of length
        n_voices. Each must be greater than 0 and at most 1.
        If None, each decay is drawn at random between 0.3 and 0.4.
    speeds : list of floats or None, default=None
        If a list, the list of positive modulation speeds (in Hz) of
        length n_voices.
        If None, each speed is drawn at random between 0.25 and 0.4 Hz.
    depths : list of floats or None, default=None
        If a list, the list of positive depths (in milliseconds) of length
        n_voices.
        If None, each depth is drawn at random between 1 and 3
        milliseconds.
    shapes : list of 's' or 't' or None, default=None
        If a list, the list of modulation shapes - 's' for sinusoidal or
        't' for triangular - of length n_voices.
        If None, each shape is chosen at random.

    Returns
    -------
    self
        The transformer itself, so that effects can be chained.

    Raises
    ------
    ValueError
        If any argument fails validation.

    '''
    if not is_number(gain_in) or gain_in <= 0 or gain_in > 1:
        raise ValueError("gain_in must be a number between 0 and 1.")
    if not is_number(gain_out) or gain_out <= 0 or gain_out > 1:
        raise ValueError("gain_out must be a number between 0 and 1.")
    if not isinstance(n_voices, int) or n_voices <= 0:
        raise ValueError("n_voices must be a positive integer.")

    # NOTE: the per-voice parameters are validated (or randomly drawn) in
    # a fixed order so that a seeded `random` yields reproducible values.

    # validate delays
    if not (delays is None or isinstance(delays, list)):
        raise ValueError("delays must be a list or None")
    if delays is not None:
        if len(delays) != n_voices:
            raise ValueError("the length of delays must equal n_voices")
        if any((not is_number(p) or p < 20) for p in delays):
            raise ValueError("the elements of delays must be numbers >= 20")
    else:
        delays = [random.uniform(40, 60) for _ in range(n_voices)]

    # validate decays
    if not (decays is None or isinstance(decays, list)):
        raise ValueError("decays must be a list or None")
    if decays is not None:
        if len(decays) != n_voices:
            raise ValueError("the length of decays must equal n_voices")
        if any((not is_number(p) or p <= 0 or p > 1) for p in decays):
            raise ValueError(
                "the elements of decays must be between 0 and 1"
            )
    else:
        decays = [random.uniform(0.3, 0.4) for _ in range(n_voices)]

    # validate speeds
    if not (speeds is None or isinstance(speeds, list)):
        raise ValueError("speeds must be a list or None")
    if speeds is not None:
        if len(speeds) != n_voices:
            raise ValueError("the length of speeds must equal n_voices")
        if any((not is_number(p) or p <= 0) for p in speeds):
            raise ValueError("the elements of speeds must be numbers > 0")
    else:
        speeds = [random.uniform(0.25, 0.4) for _ in range(n_voices)]

    # validate depths
    if not (depths is None or isinstance(depths, list)):
        raise ValueError("depths must be a list or None")
    if depths is not None:
        if len(depths) != n_voices:
            raise ValueError("the length of depths must equal n_voices")
        if any((not is_number(p) or p <= 0) for p in depths):
            raise ValueError("the elements of depths must be numbers > 0")
    else:
        depths = [random.uniform(1.0, 3.0) for _ in range(n_voices)]

    # validate shapes
    if not (shapes is None or isinstance(shapes, list)):
        raise ValueError("shapes must be a list or None")
    if shapes is not None:
        if len(shapes) != n_voices:
            raise ValueError("the length of shapes must equal n_voices")
        if any((p not in ['t', 's']) for p in shapes):
            raise ValueError("the elements of shapes must be 's' or 't'")
    else:
        shapes = [random.choice(['t', 's']) for _ in range(n_voices)]

    effect_args = ['chorus', '{}'.format(gain_in), '{}'.format(gain_out)]

    # One "delay decay speed depth -shape" group per voice.
    voices = zip(delays, decays, speeds, depths, shapes)
    for delay, decay, speed, depth, shape in voices:
        effect_args.extend([
            '{:f}'.format(delay),
            '{:f}'.format(decay),
            '{:f}'.format(speed),
            '{:f}'.format(depth),
            '-{}'.format(shape)
        ])

    self.effects.extend(effect_args)
    self.effects_log.append('chorus')
    return self
def compand(self,
            attack_time: float = 0.3,
            decay_time: float = 0.8,
            soft_knee_db: float = 6.0,
            tf_points: List[Tuple[float, float]] = [(-70, -70), (-60, -20), (0, 0)],
            ):
    '''Compress or expand (compand) the dynamic range of the audio.

    Parameters
    ----------
    attack_time : float, default=0.3
        The time in seconds over which the instantaneous level of the input
        signal is averaged to determine increases in volume.
    decay_time : float, default=0.8
        The time in seconds over which the instantaneous level of the input
        signal is averaged to determine decreases in volume.
    soft_knee_db : float or None, default=6.0
        The amount (in dB) by which the points where adjacent line
        segments of the transfer function meet will be rounded.
        If None, no soft_knee is applied.
    tf_points : list of tuples
        Transfer function points as a list of (dB, dB) tuples defining
        the compander's transfer function. Values must be <= 0 and the
        first coordinates must be unique. The points are sorted by their
        first coordinate before being passed to SoX.

    See Also
    --------
    mcompand, contrast

    '''
    if not is_number(attack_time) or attack_time <= 0:
        raise ValueError("attack_time must be a positive number.")

    if not is_number(decay_time) or decay_time <= 0:
        raise ValueError("decay_time must be a positive number.")

    # An attack slower than the decay is unusual but allowed; only warn.
    if attack_time > decay_time:
        logger.warning(
            "attack_time is larger than decay_time.\n"
            "For most situations, attack_time should be shorter than "
            "decay time because the human ear is more sensitive to sudden "
            "loud music than sudden soft music."
        )

    if not (is_number(soft_knee_db) or soft_knee_db is None):
        raise ValueError("soft_knee_db must be a number or None.")

    # --- transfer function points ---
    if not isinstance(tf_points, list):
        raise TypeError("tf_points must be a list.")
    if len(tf_points) == 0:
        raise ValueError("tf_points must have at least one point.")
    if any(not isinstance(pair, tuple) for pair in tf_points):
        raise ValueError("elements of tf_points must be pairs")
    if any(len(pair) != 2 for pair in tf_points):
        raise ValueError("Tuples in tf_points must be length 2")
    if any(not (is_number(x) and is_number(y)) for x, y in tf_points):
        raise ValueError("Tuples in tf_points must be pairs of numbers.")
    if any((x > 0 or y > 0) for x, y in tf_points):
        raise ValueError("Tuple values in tf_points must be <= 0 (dB).")
    distinct_inputs = {pair[0] for pair in tf_points}
    if len(distinct_inputs) < len(tf_points):
        raise ValueError("Found duplicate x-value in tf_points.")

    # Flatten the points, ordered by input level, into "x1,y1,x2,y2,...".
    ordered_points = sorted(tf_points, key=lambda pair: pair[0])
    curve = ",".join(
        "{:f}".format(value)
        for pair in ordered_points
        for value in (pair[0], pair[1])
    )

    # The soft knee, when given, prefixes the curve as "knee:curve".
    if soft_knee_db is not None:
        curve = "{:f}:{}".format(soft_knee_db, curve)

    self.effects.extend([
        'compand',
        "{:f},{:f}".format(attack_time, decay_time),
        curve
    ])
    self.effects_log.append('compand')
    return self
def contrast(self, amount=75):
    '''Modify the audio signal to make it sound louder; comparable with
    compression.

    Parameters
    ----------
    amount : float
        Amount of enhancement between 0 and 100.

    See Also
    --------
    compand, mcompand

    '''
    invalid_amount = 'amount must be a number between 0 and 100.'
    if not is_number(amount):
        raise ValueError(invalid_amount)
    if amount < 0 or amount > 100:
        raise ValueError(invalid_amount)

    self.effects.extend(['contrast', '{:f}'.format(amount)])
    self.effects_log.append('contrast')
    return self
def convert(self,
            samplerate: Optional[float] = None,
            n_channels: Optional[int] = None,
            bitdepth: Optional[int] = None):
    '''Convert the output audio to the requested format.

    The bit depth and channel count are stored in the output format
    arguments; the sample rate is applied through the rate effect.

    Parameters
    ----------
    samplerate : float, default=None
        Desired samplerate. If None, defaults to the same as input.
    n_channels : int, default=None
        Desired number of channels. If None, defaults to the same as input.
    bitdepth : int, default=None
        Desired bitdepth. Must be one of 8, 16, 24, 32 or 64.
        If None, defaults to the same as input.

    See Also
    --------
    rate

    '''
    allowed_bitdepths = [8, 16, 24, 32, 64]

    # Settings are validated and applied one at a time, in this order:
    # bit depth, channel count, sample rate.
    if bitdepth is not None:
        if bitdepth not in allowed_bitdepths:
            raise ValueError(
                "bitdepth must be one of {}.".format(allowed_bitdepths)
            )
        self.output_format['bits'] = bitdepth

    if n_channels is not None:
        if not (isinstance(n_channels, int) and n_channels > 0):
            raise ValueError(
                "n_channels must be a positive integer."
            )
        self.output_format['channels'] = n_channels

    if samplerate is not None:
        if not is_number(samplerate) or samplerate <= 0:
            raise ValueError("samplerate must be a positive number.")
        self.rate(samplerate)

    return self
def dcshift(self, shift: float = 0.0):
    '''Shift the audio by a constant (DC) offset.

    Parameters
    ----------
    shift : float
        Amount to shift audio between -2 and 2. (Audio is between -1 and 1)

    See Also
    --------
    highpass

    '''
    invalid_shift = 'shift must be a number between -2 and 2.'
    if not is_number(shift):
        raise ValueError(invalid_shift)
    if shift < -2 or shift > 2:
        raise ValueError(invalid_shift)

    self.effects.extend(['dcshift', '{:f}'.format(shift)])
    self.effects_log.append('dcshift')
    return self
def deemph(self):
    '''Apply Compact Disc (IEC 60908) de-emphasis (a treble attenuation
    shelving filter).

    Pre-emphasis was applied in the mastering of some CDs issued in the
    early 1980s. These included many classical music albums, as well as
    now sought-after issues of albums by The Beatles, Pink Floyd and
    others. Pre-emphasis should be removed at playback time by a
    de-emphasis filter in the playback device. However, not all modern CD
    players have this filter, and very few PC CD drives have it; playing
    pre-emphasised audio without the correct de-emphasis filter results in
    audio that sounds harsh and is far from what its creators intended.

    The de-emphasis filter is implemented as a biquad and requires the
    input audio sample rate to be either 44.1kHz or 48kHz. Maximum
    deviation from the ideal response is only 0.06dB (up to 20kHz).

    See Also
    --------
    bass, treble

    '''
    # The effect takes no arguments, so only its name is queued.
    effect_name = 'deemph'
    self.effects.extend([effect_name])
    self.effects_log.append(effect_name)
    return self
def delay(self, positions: List[float]):
    '''Delay one or more audio channels such that they start at the given
    positions.

    Parameters
    ----------
    positions : list of floats
        List of nonnegative times (in seconds) to delay each audio
        channel. If fewer positions are given than the number of
        channels, the remaining channels will be unaffected.

    Returns
    -------
    self
        The transformer itself, so that effects can be chained.

    Raises
    ------
    ValueError
        If positions is not a list, or any element is not a nonnegative
        number.

    '''
    if not isinstance(positions, list):
        raise ValueError("positions must be a list of numbers")

    # Zero is accepted (no delay for that channel); negatives are not.
    if not all((is_number(p) and p >= 0) for p in positions):
        raise ValueError("positions must be nonnegative numbers")

    effect_args = ['delay']
    effect_args.extend('{:f}'.format(p) for p in positions)

    self.effects.extend(effect_args)
    self.effects_log.append('delay')
    return self
def downsample(self, factor: int = 2):
    '''Downsample the signal by an integer factor, keeping only the first
    out of each factor samples and discarding the others.

    No decimation filter is applied. If the input is not a properly
    bandlimited baseband signal, aliasing will occur. This may be desirable
    e.g., for frequency translation.

    For a general resampling effect with anti-aliasing, see rate.

    Parameters
    ----------
    factor : int, default=2
        Downsampling factor. Must be a positive integer.

    See Also
    --------
    rate, upsample

    '''
    is_valid = isinstance(factor, int) and factor >= 1
    if not is_valid:
        raise ValueError('factor must be a positive integer.')

    self.effects.extend(['downsample', '{}'.format(factor)])
    self.effects_log.append('downsample')
    return self
def earwax(self):
    '''Make audio easier to listen to on headphones.

    Adds 'cues' to 44.1kHz stereo audio so that when listened to on
    headphones the stereo image is moved from inside your head (standard
    for headphones) to outside and in front of the listener (standard for
    speakers).

    Warning: Will only work properly on 44.1kHz stereo audio!

    '''
    # The effect takes no arguments, so only its name is queued.
    effect_name = 'earwax'
    self.effects.extend([effect_name])
    self.effects_log.append(effect_name)
    return self
def echo(self,
         gain_in: float = 0.8,
         gain_out: float = 0.9,
         n_echos: int = 1,
         delays: List[float] = [60],
         decays: List[float] = [0.4]):
    '''Add echoing to the audio.

    Echoes are reflected sound and can occur naturally amongst mountains
    (and sometimes large buildings) when talking or shouting; digital echo
    effects emulate this behaviour and are often used to help fill out
    the sound of a single instrument or vocal. The time difference
    between the original signal and the reflection is the 'delay' (time),
    and the loudness of the reflected signal is the 'decay'. Multiple
    echoes can have different delays and decays.

    Parameters
    ----------
    gain_in : float, default=0.8
        Input volume, between 0 and 1
    gain_out : float, default=0.9
        Output volume, between 0 and 1
    n_echos : int, default=1
        Number of reflections
    delays : list, default=[60]
        List of delays in milliseconds, of length n_echos
    decays : list, default=[0.4]
        List of decays, relative to gain in, between 0 and 1, of length
        n_echos

    See Also
    --------
    echos, reverb, chorus

    '''
    if not is_number(gain_in) or gain_in <= 0 or gain_in > 1:
        raise ValueError("gain_in must be a number between 0 and 1.")

    if not is_number(gain_out) or gain_out <= 0 or gain_out > 1:
        raise ValueError("gain_out must be a number between 0 and 1.")

    if not (isinstance(n_echos, int) and n_echos > 0):
        raise ValueError("n_echos must be a positive integer.")

    # delays: a list with one positive number per echo
    if not isinstance(delays, list):
        raise ValueError("delays must be a list")
    if len(delays) != n_echos:
        raise ValueError("the length of delays must equal n_echos")
    if any((not is_number(d) or d <= 0) for d in delays):
        raise ValueError("the elements of delays must be numbers > 0")

    # decays: a list with one number in (0, 1] per echo
    if not isinstance(decays, list):
        raise ValueError("decays must be a list")
    if len(decays) != n_echos:
        raise ValueError("the length of decays must equal n_echos")
    if any((not is_number(d) or d <= 0 or d > 1) for d in decays):
        raise ValueError(
            "the elements of decays must be between 0 and 1"
        )

    effect_args = ['echo', '{:f}'.format(gain_in), '{:f}'.format(gain_out)]

    # One "delay decay" pair per echo.
    for echo_delay, echo_decay in zip(delays, decays):
        effect_args.append('{}'.format(echo_delay))
        effect_args.append('{}'.format(echo_decay))

    self.effects.extend(effect_args)
    self.effects_log.append('echo')
    return self
def echos(self,
          gain_in: float = 0.8,
          gain_out: float = 0.9,
          n_echos: int = 1,
          delays: List[float] = [60],
          decays: List[float] = [0.4]):
    '''Add a sequence of echoes to the audio.

    Like the echo effect, echos stand for 'ECHO in Sequel', that is the
    first echos takes the input, the second the input and the first echos,
    the third the input and the first and the second echos, ... and so on.
    Care should be taken using many echos; a single echos has the same
    effect as a single echo.

    Parameters
    ----------
    gain_in : float, default=0.8
        Input volume, between 0 and 1
    gain_out : float, default=0.9
        Output volume, between 0 and 1
    n_echos : int, default=1
        Number of reflections
    delays : list, default=[60]
        List of delays in milliseconds, of length n_echos
    decays : list, default=[0.4]
        List of decays, relative to gain in, between 0 and 1, of length
        n_echos

    See Also
    --------
    echo, reverb, chorus

    '''
    if not is_number(gain_in) or gain_in <= 0 or gain_in > 1:
        raise ValueError("gain_in must be a number between 0 and 1.")

    if not is_number(gain_out) or gain_out <= 0 or gain_out > 1:
        raise ValueError("gain_out must be a number between 0 and 1.")

    if not (isinstance(n_echos, int) and n_echos > 0):
        raise ValueError("n_echos must be a positive integer.")

    # delays: a list with one positive number per echo
    if not isinstance(delays, list):
        raise ValueError("the delays must be a list ")
    if len(delays) != n_echos:
        raise ValueError("the length of delays must equal n_echos")
    if any((not is_number(d) or d <= 0) for d in delays):
        raise ValueError("the elements of delays must be numbers > 0")

    # decays: a list with one number in (0, 1] per echo
    if not isinstance(decays, list):
        raise ValueError("the decays must be a list ")
    if len(decays) != n_echos:
        raise ValueError("the length of decays must equal n_echos")
    if any((not is_number(d) or d <= 0 or d > 1) for d in decays):
        raise ValueError(
            "the elements of decays must be between 0 and 1"
        )

    effect_args = [
        'echos', '{:f}'.format(gain_in), '{:f}'.format(gain_out)
    ]

    # One "delay decay" pair per echo.
    for echo_delay, echo_decay in zip(delays, decays):
        effect_args.append('{:f}'.format(echo_delay))
        effect_args.append('{:f}'.format(echo_decay))

    self.effects.extend(effect_args)
    self.effects_log.append('echos')
    return self
def equalizer(self,
              frequency: float,
              width_q: float,
              gain_db: float):
    '''Boost or reduce the audio around a given frequency with a two-pole
    peaking equalisation (EQ) filter.

    This effect can be applied multiple times to produce complex EQ curves.

    Parameters
    ----------
    frequency : float
        The filter's central frequency in Hz. Must be positive.
    width_q : float
        The filter's width as a Q-factor. Must be positive.
    gain_db : float
        The filter's gain in dB.

    See Also
    --------
    bass, treble

    '''
    if not is_number(frequency) or frequency <= 0:
        raise ValueError("frequency must be a positive number.")

    if not is_number(width_q) or width_q <= 0:
        raise ValueError("width_q must be a positive number.")

    if not is_number(gain_db):
        raise ValueError("gain_db must be a number.")

    centre = '{:f}'.format(frequency)
    # the trailing 'q' tells SoX the width is a Q-factor
    width = '{:f}q'.format(width_q)
    gain = '{:f}'.format(gain_db)

    self.effects.extend(['equalizer', centre, width, gain])
    self.effects_log.append('equalizer')
    return self
def fade(self, fade_in_len: float = 0.0,
         fade_out_len: float = 0.0,
         fade_shape: Literal['q', 'h', 't', 'l', 'p'] = 'q'):
    '''Add a fade in and/or fade out to an audio file.
    Default fade shape is 1/4 sine wave.

    Parameters
    ----------
    fade_in_len : float, default=0.0
        Length of fade-in (seconds). If fade_in_len = 0,
        no fade in is applied.
    fade_out_len : float, default=0.0
        Length of fade-out (seconds). If fade_out_len = 0,
        no fade out is applied.
    fade_shape : str, default='q'
        Shape of fade. Must be one of
         * 'q' for quarter sine (default),
         * 'h' for half sine,
         * 't' for linear,
         * 'l' for logarithmic
         * 'p' for inverted parabola.

    See Also
    --------
    splice

    '''
    fade_shapes = ['q', 'h', 't', 'l', 'p']
    if fade_shape not in fade_shapes:
        raise ValueError(
            "Fade shape must be one of {}".format(" ".join(fade_shapes))
        )

    if not is_number(fade_in_len) or fade_in_len < 0:
        raise ValueError("fade_in_len must be a nonnegative number.")

    if not is_number(fade_out_len) or fade_out_len < 0:
        raise ValueError("fade_out_len must be a nonnegative number.")

    shape_arg = '{}'.format(fade_shape)

    fade_in_args = []
    if fade_in_len > 0:
        fade_in_args = ['fade', shape_arg, '{:f}'.format(fade_in_len)]

    # The fade-out is expressed as a fade-in on the reversed audio, which
    # is then reversed back.
    fade_out_args = []
    if fade_out_len > 0:
        fade_out_args = [
            'reverse', 'fade', shape_arg,
            '{:f}'.format(fade_out_len), 'reverse'
        ]

    effect_args = fade_in_args + fade_out_args

    # With both lengths at zero nothing is queued or logged.
    if effect_args:
        self.effects.extend(effect_args)
        self.effects_log.append('fade')

    return self
def fir(self, coefficients: List[float]):
    '''Filter the audio with the given FIR coefficients using SoX's FFT
    convolution engine.

    Parameters
    ----------
    coefficients : list
        fir filter coefficients

    '''
    if not isinstance(coefficients, list):
        raise ValueError("coefficients must be a list")

    numeric_flags = [is_number(value) for value in coefficients]
    if not all(numeric_flags):
        raise ValueError("coefficients must be numbers.")

    formatted = ['{:f}'.format(value) for value in coefficients]

    self.effects.extend(['fir'] + formatted)
    self.effects_log.append('fir')
    return self
def flanger(self,
            delay: float = 0, depth: float = 2,
            regen: float = 0, width: float = 71, speed: float = 0.5,
            shape: Literal['sine', 'triangle'] = 'sine',
            phase: float = 25,
            interp: Literal['linear', 'quadratic'] = 'linear'):
    '''Add a flanging effect to the audio.

    Parameters
    ----------
    delay : float, default=0
        Base delay (in milliseconds) between 0 and 30.
    depth : float, default=2
        Added swept delay (in milliseconds) between 0 and 10.
    regen : float, default=0
        Percentage regeneration between -95 and 95.
    width : float, default=71,
        Percentage of delayed signal mixed with original between 0 and 100.
    speed : float, default=0.5
        Sweeps per second (in Hz) between 0.1 and 10.
    shape : 'sine' or 'triangle', default='sine'
        Swept wave shape
    phase : float, default=25
        Swept wave percentage phase-shift for multi-channel flange between
        0 and 100. 0 = 100 = same phase on each channel
    interp : 'linear' or 'quadratic', default='linear'
        Digital delay-line interpolation type.

    See Also
    --------
    tremolo

    '''
    # Arguments are checked in signature order.
    if not is_number(delay) or delay < 0 or delay > 30:
        raise ValueError("delay must be a number between 0 and 30.")

    if not is_number(depth) or depth < 0 or depth > 10:
        raise ValueError("depth must be a number between 0 and 10.")

    if not is_number(regen) or regen < -95 or regen > 95:
        raise ValueError("regen must be a number between -95 and 95.")

    if not is_number(width) or width < 0 or width > 100:
        raise ValueError("width must be a number between 0 and 100.")

    if not is_number(speed) or speed < 0.1 or speed > 10:
        raise ValueError("speed must be a number between 0.1 and 10.")

    if shape not in ['sine', 'triangle']:
        raise ValueError("shape must be one of 'sine' or 'triangle'.")

    if not is_number(phase) or phase < 0 or phase > 100:
        raise ValueError("phase must be a number between 0 and 100.")

    if interp not in ['linear', 'quadratic']:
        raise ValueError("interp must be one of 'linear' or 'quadratic'.")

    # SoX takes the arguments positionally:
    # delay depth regen width speed shape phase interp
    effect_args = ['flanger']
    effect_args.extend(
        '{:f}'.format(value)
        for value in (delay, depth, regen, width, speed)
    )
    effect_args.extend([
        '{}'.format(shape),
        '{:f}'.format(phase),
        '{}'.format(interp)
    ])

    self.effects.extend(effect_args)
    self.effects_log.append('flanger')
    return self
def gain(self,
         gain_db: float = 0.0,
         normalize: bool = True,
         limiter: bool = False,
         balance: Optional[Literal['e', 'B', 'b']] = None):
    '''Amplify or attenuate the audio signal.

    Parameters
    ----------
    gain_db : float, default=0.0
        Gain adjustment in decibels (dB).
    normalize : bool, default=True
        If True, audio is normalized to gain_db relative to full scale.
        If False, simply adjusts the audio power level by gain_db.
    limiter : bool, default=False
        If True, a simple limiter is invoked to prevent clipping.
    balance : str or None, default=None
        Balance gain across channels. Can be one of:
         * None applies no balancing (default)
         * 'e' applies gain to all channels other than that with the
            highest peak level, such that all channels attain the same
            peak level
         * 'B' applies gain to all channels other than that with the
            highest RMS level, such that all channels attain the same
            RMS level
         * 'b' applies gain with clipping protection to all channels other
            than that with the highest RMS level, such that all channels
            attain the same RMS level
        If normalize=True, 'B' and 'b' are equivalent.

    See Also
    --------
    loudness

    '''
    if not is_number(gain_db):
        raise ValueError("gain_db must be a number.")

    if not isinstance(normalize, bool):
        raise ValueError("normalize must be a boolean.")

    if not isinstance(limiter, bool):
        raise ValueError("limiter must be a boolean.")

    if balance not in [None, 'e', 'B', 'b']:
        raise ValueError("balance must be one of None, 'e', 'B', or 'b'.")

    # Option flags come first (balance, normalize, limiter), then the gain.
    flags = []
    if balance is not None:
        flags.append('-{}'.format(balance))
    if normalize:
        flags.append('-n')
    if limiter:
        flags.append('-l')

    effect_args = ['gain'] + flags + ['{:f}'.format(gain_db)]

    self.effects.extend(effect_args)
    self.effects_log.append('gain')
    return self
def highpass(self,
             frequency: float,
             width_q: float = 0.707,
             n_poles: int = 2):
    '''Filter the audio with a high-pass filter with 3dB point frequency.

    The filter can be either single-pole or double-pole. The filters roll
    off at 6dB per pole per octave (20dB per pole per decade).

    Parameters
    ----------
    frequency : float
        The filter's cutoff frequency in Hz.
    width_q : float, default=0.707
        The filter's width as a Q-factor. Applies only when n_poles=2.
        The default gives a Butterworth response.
    n_poles : int, default=2
        The number of poles in the filter. Must be either 1 or 2

    See Also
    --------
    lowpass, equalizer, sinc, allpass

    '''
    if not is_number(frequency) or frequency <= 0:
        raise ValueError("frequency must be a positive number.")

    if not is_number(width_q) or width_q <= 0:
        raise ValueError("width_q must be a positive number.")

    if n_poles not in [1, 2]:
        raise ValueError("n_poles must be 1 or 2.")

    pole_flag = '-{}'.format(n_poles)
    effect_args = ['highpass', pole_flag, '{:f}'.format(frequency)]

    # The width is only passed for the two-pole filter.
    if n_poles == 2:
        effect_args.append('{:f}q'.format(width_q))

    self.effects.extend(effect_args)
    self.effects_log.append('highpass')
    return self
def lowpass(self,
            frequency: float,
            width_q: float = 0.707,
            n_poles: int = 2):
    '''Filter the audio with a low-pass filter with 3dB point frequency.

    The filter can be either single-pole or double-pole. The filters roll
    off at 6dB per pole per octave (20dB per pole per decade).

    Parameters
    ----------
    frequency : float
        The filter's cutoff frequency in Hz.
    width_q : float, default=0.707
        The filter's width as a Q-factor. Applies only when n_poles=2.
        The default gives a Butterworth response.
    n_poles : int, default=2
        The number of poles in the filter. Must be either 1 or 2

    See Also
    --------
    highpass, equalizer, sinc, allpass

    '''
    if not is_number(frequency) or frequency <= 0:
        raise ValueError("frequency must be a positive number.")

    if not is_number(width_q) or width_q <= 0:
        raise ValueError("width_q must be a positive number.")

    if n_poles not in [1, 2]:
        raise ValueError("n_poles must be 1 or 2.")

    pole_flag = '-{}'.format(n_poles)
    effect_args = ['lowpass', pole_flag, '{:f}'.format(frequency)]

    # The width is only passed for the two-pole filter.
    if n_poles == 2:
        effect_args.append('{:f}q'.format(width_q))

    self.effects.extend(effect_args)
    self.effects_log.append('lowpass')
    return self
def hilbert(self, num_taps: Optional[int] = None):
    '''Apply an odd-tap Hilbert transform filter, phase-shifting the signal
    by 90 degrees. This is used in many matrix coding schemes and for
    analytic signal generation. The process is often written as a
    multiplication by i (or j), the imaginary unit. An odd-tap Hilbert
    transform filter has a bandpass characteristic, attenuating the lowest
    and highest frequencies.

    Parameters
    ----------
    num_taps : int or None, default=None
        Number of filter taps - must be odd. If None, it is chosen to have
        a cutoff frequency of about 75 Hz.

    Returns
    -------
    self
        The transformer itself, so that effects can be chained.

    Raises
    ------
    ValueError
        If num_taps is neither None nor an odd integer.

    '''
    if num_taps is not None:
        if not isinstance(num_taps, int):
            raise ValueError("num_taps must be None or an odd integer.")
        if num_taps % 2 == 0:
            raise ValueError("num_taps must be an odd integer.")

    effect_args = ['hilbert']

    # Without -n, SoX picks the number of taps itself.
    if num_taps is not None:
        effect_args.extend(['-n', '{}'.format(num_taps)])

    self.effects.extend(effect_args)
    self.effects_log.append('hilbert')
    return self
def loudness(self, gain_db: float = -10.0, reference_level: float = 65.0):
    '''Loudness control. Similar to the gain effect, but provides
    equalisation for the human auditory system.

    The gain is adjusted by gain_db and the signal is equalised according
    to ISO 226 w.r.t. reference_level.

    Parameters
    ----------
    gain_db : float, default=-10.0
        Loudness adjustment amount (in dB)
    reference_level : float, default=65.0
        Reference level (in dB) according to which the signal is equalized.
        Must be between 50 and 75 (dB)

    See Also
    --------
    gain

    '''
    if not is_number(gain_db):
        raise ValueError('gain_db must be a number.')

    # Type and range of reference_level are reported separately.
    if not is_number(reference_level):
        raise ValueError('reference_level must be a number')
    if reference_level > 75 or reference_level < 50:
        raise ValueError('reference_level must be between 50 and 75')

    adjustment = '{:f}'.format(gain_db)
    reference = '{:f}'.format(reference_level)

    self.effects.extend(['loudness', adjustment, reference])
    self.effects_log.append('loudness')
    return self
[docs] def mcompand(self,
n_bands: int = 2,
crossover_frequencies: List[float] = [1600],
attack_time: List[float] = [0.005, 0.000625],
decay_time: List[float] = [0.1, 0.0125],
soft_knee_db: List[Optional[float]] = [6.0, None],
tf_points: List[List[Tuple[float, float]]] = [
[(-47, -40), (-34, -34), (-17, -33), (0, 0)],
[(-47, -40), (-34, -34), (-15, -33), (0, 0)]
],
gain: List[Optional[float]] = [None, None]):
'''The multi-band compander is similar to the single-band compander but
the audio is first divided into bands using Linkwitz-Riley cross-over
filters and a separately specifiable compander run on each band.
When used with n_bands=1, this effect is identical to compand.
When using n_bands > 1, the first set of arguments applies a single
band compander, and each subsequent set of arugments is applied on
each of the crossover frequencies.
Parameters
----------
n_bands : int, default=2
The number of bands.
crossover_frequencies : list of float, default=[1600]
A list of crossover frequencies in Hz of length n_bands-1.
The first band is always the full spectrum, followed by the bands
specified by crossover_frequencies.
attack_time : list of float, default=[0.005, 0.000625]
A list of length n_bands, where each element is the time in seconds
over which the instantaneous level of the input signal is averaged
to determine increases in volume over the current band.
decay_time : list of float, default=[0.1, 0.0125]
A list of length n_bands, where each element is the time in seconds
over which the instantaneous level of the input signal is averaged
to determine decreases in volume over the current band.
soft_knee_db : list of float or None, default=[6.0, None]
A list of length n_bands, where each element is the ammount (in dB)
for which the points at where adjacent line segments on the
transfer function meet will be rounded over the current band.
If None, no soft_knee is applied.
tf_points : list of list of tuples, default=[
[(-47, -40), (-34, -34), (-17, -33), (0, 0)],
[(-47, -40), (-34, -34), (-15, -33), (0, 0)]]
A list of length n_bands, where each element is the transfer
function points as a list of tuples corresponding to points in
(dB, dB) defining the compander's transfer function over the
current band.
gain : list of floats or None
A list of gain values for each frequency band.
If None, no gain is applied.
See Also
--------
compand, contrast
'''
if not isinstance(n_bands, int) or n_bands < 1:
raise ValueError("n_bands must be a positive integer.")
if (not isinstance(crossover_frequencies, list) or
len(crossover_frequencies) != n_bands - 1):
raise ValueError(
"crossover_frequences must be a list of length n_bands - 1"
)
if any([not is_number(f) or f < 0 for f in crossover_frequencies]):
raise ValueError(
"crossover_frequencies elements must be positive floats."
)
if not isinstance(attack_time, list) or len(attack_time) != n_bands:
raise ValueError("attack_time must be a list of length n_bands")
if any([not is_number(a) or a <= 0 for a in attack_time]):
raise ValueError("attack_time elements must be positive numbers.")
if not isinstance(decay_time, list) or len(decay_time) != n_bands:
raise ValueError("decay_time must be a list of length n_bands")
if any([not is_number(d) or d <= 0 for d in decay_time]):
raise ValueError("decay_time elements must be positive numbers.")
if any([a > d for a, d in zip(attack_time, decay_time)]):
logger.warning(
"Elements of attack_time are larger than decay_time.\n"
"For most situations, attack_time should be shorter than "
"decay time because the human ear is more sensitive to sudden "
"loud music than sudden soft music."
)
if not isinstance(soft_knee_db, list) or len(soft_knee_db) != n_bands:
raise ValueError("soft_knee_db must be a list of length n_bands.")
if any([(not is_number(d) and d is not None) for d in soft_knee_db]):
raise ValueError(
"elements of soft_knee_db must be a number or None."
)
if not isinstance(tf_points, list) or len(tf_points) != n_bands:
raise ValueError("tf_points must be a list of length n_bands.")
if any([not isinstance(t, list) or len(t) == 0 for t in tf_points]):
raise ValueError(
"tf_points must be a list with at least one point."
)
for tfp in tf_points:
if any(not isinstance(pair, tuple) for pair in tfp):
raise ValueError("elements of tf_points lists must be pairs")
if any(len(pair) != 2 for pair in tfp):
raise ValueError("Tuples in tf_points lists must be length 2")
if any(not (is_number(p[0]) and is_number(p[1])) for p in tfp):
raise ValueError(
"Tuples in tf_points lists must be pairs of numbers."
)
if any((p[0] > 0 or p[1] > 0) for p in tfp):
raise ValueError(
"Tuple values in tf_points lists must be <= 0 (dB)."
)
if len(tfp) > len(set([p[0] for p in tfp])):
raise ValueError("Found duplicate x-value in tf_points list.")
if not isinstance(gain, list) or len(gain) != n_bands:
raise ValueError("gain must be a list of length n_bands")
if any([not (is_number(g) or g is None) for g in gain]):
raise ValueError("gain elements must be numbers or None.")
effect_args = ['mcompand']
for i in range(n_bands):
if i > 0:
effect_args.append('{:f}'.format(crossover_frequencies[i - 1]))
intermed_args = ["{:f},{:f}".format(attack_time[i], decay_time[i])]
tf_points_band = tf_points[i]
tf_points_band = sorted(
tf_points_band,
key=lambda tf_points_band: tf_points_band[0]
)
transfer_list = []
for point in tf_points_band:
transfer_list.extend([
"{:f}".format(point[0]), "{:f}".format(point[1])
])
if soft_knee_db[i] is not None:
intermed_args.append(
"{:f}:{}".format(soft_knee_db[i], ",".join(transfer_list))
)
else:
intermed_args.append(",".join(transfer_list))
if gain[i] is not None:
intermed_args.append("{:f}".format(gain[i]))
effect_args.append(' '.join(intermed_args))
self.effects.extend(effect_args)
self.effects_log.append('mcompand')
return self
def noiseprof(self,
              input_filepath: Union[str, Path],
              profile_path: Union[str, Path]):
    '''Calculate a profile of the audio for use in noise reduction.

    Running this command does not affect the Transformer effects
    chain. When this function is called, the calculated noise profile
    file is saved to the `profile_path`.

    Parameters
    ----------
    input_filepath : str or Path
        Path to audiofile from which to compute a noise profile.
    profile_path : str or Path
        Path to save the noise profile file.

    Raises
    ------
    ValueError
        If `profile_path` is an existing directory.
    IOError
        If the directory that would hold `profile_path` is not writeable.

    See Also
    --------
    noisered

    '''
    # Normalise path-like objects to their plain form once: the value ends
    # up in the SoX argument list, where a Path object and its string form
    # must behave the same.
    profile_path = os.fspath(profile_path)

    if os.path.isdir(profile_path):
        raise ValueError(
            "profile_path {} is a directory.".format(profile_path))

    # A bare file name has no directory component, so the directory whose
    # permissions matter is the current working directory.
    if os.path.dirname(profile_path) == '' and profile_path != '':
        _abs_profile_path = os.path.join(os.getcwd(), profile_path)
    else:
        _abs_profile_path = profile_path

    if not os.access(os.path.dirname(_abs_profile_path), os.W_OK):
        raise IOError(
            "profile_path {} is not writeable.".format(_abs_profile_path))

    effect_args = ['noiseprof', profile_path]
    # The effect is passed through extra_args only; self.effects and
    # self.effects_log are not modified by this method.
    self.build(input_filepath, '-n', extra_args=effect_args)
    return None
def noisered(self, profile_path: Union[str, Path], amount: float = 0.5):
    '''Reduce noise in the audio signal by profiling and filtering.
    This effect is moderately effective at removing consistent
    background noise such as hiss or hum.

    Parameters
    ----------
    profile_path : str or Path
        Path to a noise profile file.
        This file can be generated using the `noiseprof` effect.
    amount : float, default=0.5
        How much noise should be removed is specified by amount. Should
        be between 0 and 1. Higher numbers will remove more noise but
        present a greater likelihood of removing wanted components of
        the audio signal.

    Raises
    ------
    IOError
        If `profile_path` does not exist.
    ValueError
        If `amount` is not a number between 0 and 1 (inclusive).

    See Also
    --------
    noiseprof

    '''
    if not os.path.exists(profile_path):
        raise IOError(
            "profile_path {} does not exist.".format(profile_path))

    if not is_number(amount) or amount < 0 or amount > 1:
        raise ValueError("amount must be a number between 0 and 1.")

    effect_args = [
        'noisered',
        # self.effects is a list of strings; convert path-like objects so a
        # Path argument does not end up in the argument list as an object.
        os.fspath(profile_path),
        '{:f}'.format(amount)
    ]
    self.effects.extend(effect_args)
    self.effects_log.append('noisered')

    return self
def norm(self, db_level: float = -3.0):
    '''Queue a `norm` effect that normalizes the audio to a given db level.

    This behaves identically to the gain effect with normalize=True.

    Parameters
    ----------
    db_level : float, default=-3.0
        Output volume (db)

    Raises
    ------
    ValueError
        If `db_level` is not a number.

    See Also
    --------
    gain, loudness

    '''
    if not is_number(db_level):
        raise ValueError('db_level must be a number.')

    level_arg = '{:f}'.format(db_level)
    self.effects.extend(['norm', level_arg])
    self.effects_log.append('norm')
    return self
def oops(self):
    '''Out Of Phase Stereo effect.

    Mixes stereo to twin-mono where each mono channel contains the
    difference between the left and right stereo channels. This is
    sometimes known as the 'karaoke' effect as it often has the effect of
    removing most or all of the vocals from a recording.

    The effect takes no arguments; it is appended to the effects chain and
    the transformer is returned.

    '''
    name = 'oops'
    self.effects.append(name)
    self.effects_log.append(name)
    return self
def overdrive(self, gain_db: float = 20.0, colour: float = 20.0):
    '''Apply non-linear distortion.

    Parameters
    ----------
    gain_db : float, default=20
        Controls the amount of distortion (dB).
    colour : float, default=20
        Controls the amount of even harmonic content in the output (dB).

    Raises
    ------
    ValueError
        If `gain_db` or `colour` is not a number.

    '''
    if not is_number(gain_db):
        # Name the parameter that was actually rejected.
        raise ValueError('gain_db must be a number.')

    if not is_number(colour):
        raise ValueError('colour must be a number.')

    effect_args = [
        'overdrive',
        '{:f}'.format(gain_db),
        '{:f}'.format(colour)
    ]
    self.effects.extend(effect_args)
    self.effects_log.append('overdrive')

    return self
def pad(self, start_duration: float = 0.0, end_duration: float = 0.0):
    '''Add silence to the beginning or end of a file.
    Calling this with the default arguments has no effect.

    Parameters
    ----------
    start_duration : float
        Number of seconds of silence to add to beginning.
    end_duration : float
        Number of seconds of silence to add to end.

    Raises
    ------
    ValueError
        If either duration is not a number or is negative.

    See Also
    --------
    delay

    '''
    # Validate in declaration order so the first offending argument is the
    # one reported.
    checks = (
        (start_duration, "Start duration must be a positive number."),
        (end_duration, "End duration must be positive."),
    )
    for seconds, message in checks:
        if not is_number(seconds) or seconds < 0:
            raise ValueError(message)

    self.effects.extend(
        ['pad'] + ['{:f}'.format(s) for s in (start_duration, end_duration)]
    )
    self.effects_log.append('pad')
    return self
def phaser(self,
           gain_in: float = 0.8, gain_out: float = 0.74,
           delay: float = 3, decay: float = 0.4, speed: float = 0.5,
           modulation_shape: Literal['sinusoidal', 'triangular'] = 'sinusoidal'):
    '''Apply a phasing effect to the audio.

    Parameters
    ----------
    gain_in : float, default=0.8
        Input volume, greater than 0 and at most 1.
    gain_out : float, default=0.74
        Output volume, greater than 0 and at most 1.
    delay : float, default=3
        Delay in milliseconds, greater than 0 and at most 5.
    decay : float, default=0.4
        Decay relative to gain_in, between 0.1 and 0.5.
    speed : float, default=0.5
        Modulation speed in Hz, between 0.1 and 2.
    modulation_shape : str, default='sinusoidal'
        Modulation shape. One of 'sinusoidal' or 'triangular'.

    Raises
    ------
    ValueError
        If any argument is of the wrong type or outside its range.

    See Also
    --------
    flanger, tremolo

    '''
    if not is_number(gain_in) or gain_in <= 0 or gain_in > 1:
        raise ValueError("gain_in must be a number between 0 and 1.")

    if not is_number(gain_out) or gain_out <= 0 or gain_out > 1:
        raise ValueError("gain_out must be a number between 0 and 1.")

    # The messages for delay and speed state the enforced range, so that a
    # positive but out-of-range value is not reported as "not positive".
    if not is_number(delay) or delay <= 0 or delay > 5:
        raise ValueError("delay must be a number between 0 and 5.")

    if not is_number(decay) or decay < 0.1 or decay > 0.5:
        raise ValueError("decay must be a number between 0.1 and 0.5.")

    if not is_number(speed) or speed < 0.1 or speed > 2:
        raise ValueError("speed must be a number between 0.1 and 2.")

    if modulation_shape not in ['sinusoidal', 'triangular']:
        raise ValueError(
            "modulation_shape must be one of 'sinusoidal', 'triangular'."
        )

    effect_args = [
        'phaser',
        '{:f}'.format(gain_in),
        '{:f}'.format(gain_out),
        '{:f}'.format(delay),
        '{:f}'.format(decay),
        '{:f}'.format(speed)
    ]

    # The shape is passed to SoX as a trailing flag.
    if modulation_shape == 'sinusoidal':
        effect_args.append('-s')
    elif modulation_shape == 'triangular':
        effect_args.append('-t')

    self.effects.extend(effect_args)
    self.effects_log.append('phaser')

    return self
def pitch(self, n_semitones: float, quick: bool = False):
    '''Pitch shift the audio without changing the tempo.

    This effect uses the WSOLA algorithm. The audio is chopped up into
    segments which are then shifted in the time domain and overlapped
    (cross-faded) at points where their waveforms are most similar as
    determined by measurement of least squares.

    Parameters
    ----------
    n_semitones : float
        The number of semitones to shift. Can be positive or negative.
    quick : bool, default=False
        If True, this effect will run faster but with lower sound quality.

    Raises
    ------
    ValueError
        If `n_semitones` is not a number or `quick` is not a boolean.

    See Also
    --------
    bend, speed, tempo

    '''
    if not is_number(n_semitones):
        # Negative shifts are valid, so the message must not ask for a
        # positive number.
        raise ValueError("n_semitones must be a number")

    if n_semitones < -12 or n_semitones > 12:
        logger.warning(
            "Using an extreme pitch shift. "
            "Quality of results will be poor"
        )

    if not isinstance(quick, bool):
        raise ValueError("quick must be a boolean.")

    effect_args = ['pitch']

    if quick:
        effect_args.append('-q')

    # The shift is handed to SoX as semitones * 100.
    effect_args.append('{:f}'.format(n_semitones * 100.))

    self.effects.extend(effect_args)
    self.effects_log.append('pitch')

    return self
def rate(self, samplerate: float,
         quality: Literal['q', 'l', 'm', 'h', 'v'] = 'h'):
    '''Resample the audio to the given `samplerate`.

    Better the resampling quality = slower runtime.

    Parameters
    ----------
    samplerate : float
        Desired sample rate.
    quality : str
        Resampling quality. One of:
            * q : Quick - very low quality,
            * l : Low,
            * m : Medium,
            * h : High (default),
            * v : Very high

    Raises
    ------
    ValueError
        If `samplerate` is not a positive number or `quality` is not one
        of the listed codes.

    See Also
    --------
    upsample, downsample, convert

    '''
    quality_vals = ['q', 'l', 'm', 'h', 'v']

    if not is_number(samplerate) or samplerate <= 0:
        raise ValueError("Samplerate must be a positive number.")

    if quality not in quality_vals:
        allowed = ' '.join(quality_vals)
        raise ValueError("Quality must be one of {}.".format(allowed))

    # The quality code becomes a dash-prefixed flag ahead of the rate.
    quality_flag = '-{}'.format(quality)
    rate_arg = '{:f}'.format(samplerate)

    self.effects.extend(['rate', quality_flag, rate_arg])
    self.effects_log.append('rate')
    return self
def remix(self,
          remix_dictionary: Optional[Dict[int, List[int]]] = None,
          num_output_channels: Optional[int] = None):
    '''Remix the channels of an audio file.

    Note: volume options are not yet implemented

    Parameters
    ----------
    remix_dictionary : dict or None
        Dictionary mapping output channel to list of input channel(s).
        Empty lists indicate the corresponding output channel should be
        empty. If None, mixes all channels down to a single mono file.
    num_output_channels : int or None
        The number of channels in the output file. If None, the number of
        output channels is equal to the largest key in remix_dictionary.
        If remix_dictionary is None, this variable is ignored.

    Raises
    ------
    ValueError
        If an argument has the wrong type or contains non-positive or
        non-integer channel numbers, or if `remix_dictionary` is empty
        while `num_output_channels` is None (the channel count cannot be
        inferred).

    Examples
    --------
    Remix a 4-channel input file. The output file will have
    input channel 2 in channel 1, a mixdown of input channels 1 and 3 in
    channel 2, an empty channel 3, and a copy of input channel 4 in
    channel 4.

    >>> import sox
    >>> tfm = sox.Transformer()
    >>> remix_dictionary = {1: [2], 2: [1, 3], 4: [4]}
    >>> tfm.remix(remix_dictionary)

    '''
    if not (isinstance(remix_dictionary, dict) or
            remix_dictionary is None):
        raise ValueError("remix_dictionary must be a dictionary or None.")

    if remix_dictionary is not None:

        if not all(isinstance(i, int) and i > 0
                   for i in remix_dictionary):
            raise ValueError(
                "remix dictionary must have positive integer keys."
            )

        if not all(isinstance(v, list)
                   for v in remix_dictionary.values()):
            raise ValueError("remix dictionary values must be lists.")

        for v_list in remix_dictionary.values():
            if not all(isinstance(v, int) and v > 0 for v in v_list):
                raise ValueError(
                    "elements of remix dictionary values must "
                    "be positive integers"
                )

    if not ((isinstance(num_output_channels, int) and
             num_output_channels > 0) or num_output_channels is None):
        raise ValueError(
            "num_output_channels must be a positive integer or None."
        )

    effect_args = ['remix']
    if remix_dictionary is None:
        effect_args.append('-')
    else:
        if num_output_channels is None:
            if not remix_dictionary:
                # max() of an empty mapping would fail with an unrelated
                # message; say what is actually wrong.
                raise ValueError(
                    "remix_dictionary must not be empty when "
                    "num_output_channels is None."
                )
            num_output_channels = max(remix_dictionary)

        for channel in range(1, num_output_channels + 1):
            sources = remix_dictionary.get(channel)
            if sources:
                out_channel = ','.join(str(i) for i in sources)
            else:
                # Both a missing key and an empty list mean an empty
                # output channel, which is written as '0'.
                out_channel = '0'

            effect_args.append(out_channel)

    self.effects.extend(effect_args)
    self.effects_log.append('remix')

    return self
def repeat(self, count: int = 1):
    '''Repeat the entire audio count times.

    Parameters
    ----------
    count : int, default=1
        The number of times to repeat the audio.

    Returns
    -------
    The transformer itself, so that effect calls can be chained like the
    other effect methods.

    Raises
    ------
    ValueError
        If `count` is not an integer greater than or equal to 1.

    '''
    if not isinstance(count, int) or count < 1:
        raise ValueError("count must be a postive integer.")

    effect_args = ['repeat', '{}'.format(count)]
    self.effects.extend(effect_args)
    self.effects_log.append('repeat')

    # Returning self keeps this method usable in a chain of effect calls.
    return self
def reverb(self,
           reverberance: float = 50,
           high_freq_damping: float = 50,
           room_scale: float = 100,
           stereo_depth: float = 100,
           pre_delay: float = 0,
           wet_gain: float = 0,
           wet_only: bool = False):
    '''Add reverberation to the audio using the ‘freeverb’ algorithm.
    A reverberation effect is sometimes desirable for concert halls that
    are too small or contain so many people that the hall’s natural
    reverberance is diminished. Applying a small amount of stereo reverb
    to a (dry) mono signal will usually make it sound more natural.

    Parameters
    ----------
    reverberance : float, default=50
        Percentage of reverberance
    high_freq_damping : float, default=50
        Percentage of high-frequency damping.
    room_scale : float, default=100
        Scale of the room as a percentage.
    stereo_depth : float, default=100
        Stereo depth as a percentage.
    pre_delay : float, default=0
        Pre-delay in milliseconds.
    wet_gain : float, default=0
        Amount of wet gain in dB
    wet_only : bool, default=False
        If True, only outputs the wet signal.

    Raises
    ------
    ValueError
        If a percentage is not a number in [0, 100], `pre_delay` is not a
        non-negative number, `wet_gain` is not a number, or `wet_only` is
        not a boolean.

    See Also
    --------
    echo

    '''
    # The four percentage arguments share one rule; check them in
    # signature order so the first bad one is reported.
    percentages = (
        ('reverberance', reverberance),
        ('high_freq_damping', high_freq_damping),
        ('room_scale', room_scale),
        ('stereo_depth', stereo_depth),
    )
    for label, percent in percentages:
        if not is_number(percent) or percent < 0 or percent > 100:
            raise ValueError("{} must be between 0 and 100".format(label))

    if not is_number(pre_delay) or pre_delay < 0:
        raise ValueError("pre_delay must be a positive number")

    if not is_number(wet_gain):
        raise ValueError("wet_gain must be a number")

    if not isinstance(wet_only, bool):
        raise ValueError("wet_only must be a boolean.")

    effect_args = ['reverb', '-w'] if wet_only else ['reverb']
    numeric_values = (
        reverberance, high_freq_damping, room_scale,
        stereo_depth, pre_delay, wet_gain,
    )
    effect_args.extend('{:f}'.format(value) for value in numeric_values)

    self.effects.extend(effect_args)
    self.effects_log.append('reverb')
    return self
def reverse(self):
    '''Reverse the audio completely.

    Appends the argument-less `reverse` effect to the effects chain and
    returns the transformer.

    '''
    name = 'reverse'
    self.effects.append(name)
    self.effects_log.append(name)
    return self
def silence(self,
            location: Literal[0, 1, -1] = 0,
            silence_threshold: float = 0.1,
            min_silence_duration: float = 0.1,
            buffer_around_silence: bool = False):
    '''Removes silent regions from an audio file.

    Parameters
    ----------
    location : int, default=0
        Where to remove silence. One of:
         * 0 to remove silence throughout the file (default),
         * 1 to remove silence from the beginning,
         * -1 to remove silence from the end,
    silence_threshold : float, default=0.1
        Silence threshold as percentage of maximum sample amplitude.
        Must be at least 0 and less than 100.
    min_silence_duration : float, default=0.1
        The minimum amount of time in seconds required for a region to be
        considered non-silent.
    buffer_around_silence : bool, default=False
        If True, leaves a buffer of min_silence_duration around removed
        silent regions.

    Raises
    ------
    ValueError
        If any argument has the wrong type or is out of range.

    See Also
    --------
    vad

    '''
    if location not in [-1, 0, 1]:
        raise ValueError("location must be one of -1, 0, 1.")

    if (not is_number(silence_threshold) or silence_threshold < 0
            or silence_threshold >= 100):
        raise ValueError(
            "silence_threshold must be a number between 0 and 100"
        )

    if not is_number(min_silence_duration) or min_silence_duration <= 0:
        raise ValueError(
            "min_silence_duration must be a positive number."
        )

    if not isinstance(buffer_around_silence, bool):
        raise ValueError("buffer_around_silence must be a boolean.")

    duration_arg = '{:f}'.format(min_silence_duration)
    threshold_arg = '{:f}%'.format(silence_threshold)
    from_end = location == -1

    # Trimming from the end is expressed as reverse / silence / reverse.
    effect_args = ['reverse'] if from_end else []

    effect_args.append('silence')
    if buffer_around_silence:
        effect_args.append('-l')

    effect_args += ['1', duration_arg, threshold_arg]

    if location == 0:
        # A second group of arguments is added only for the
        # "throughout the file" mode.
        effect_args += ['-1', duration_arg, threshold_arg]

    if from_end:
        effect_args.append('reverse')

    self.effects.extend(effect_args)
    self.effects_log.append('silence')
    return self
def sinc(self,
         filter_type: Literal['high', 'low', 'pass', 'reject'] = 'high',
         cutoff_freq: Union[float, List[float]] = 3000,
         stop_band_attenuation: float = 120,
         transition_bw: Optional[Union[float, List[float]]] = None,
         phase_response: Optional[float] = None):
    '''Apply a sinc kaiser-windowed low-pass, high-pass, band-pass, or
    band-reject filter to the signal.

    Parameters
    ----------
    filter_type : str, default='high'
        Type of filter. One of:
            - 'high' for a high-pass filter
            - 'low' for a low-pass filter
            - 'pass' for a band-pass filter
            - 'reject' for a band-reject filter
    cutoff_freq : float or list, default=3000
        A scalar or length 2 list indicating the filter's critical
        frequencies. The critical frequencies are given in Hz and must be
        positive. For a high-pass or low-pass filter, cutoff_freq
        must be a scalar. For a band-pass or band-reject filter, it must be
        a length 2 list (it is sorted before use).
    stop_band_attenuation : float, default=120
        The stop band attenuation in dB
    transition_bw : float, list or None, default=None
        The transition band-width in Hz.
        If None, sox's default of 5% of the total bandwidth is used.
        If a float, the given transition bandwidth is used for both the
        upper and lower bands (if applicable).
        If a list, the first argument is used for the lower band and the
        second for the upper band.
    phase_response : float or None
        The filter's phase response between 0 (minimum) and 100 (maximum).
        If None, sox's default phase response is used.

    Raises
    ------
    ValueError
        If any argument has the wrong type, shape or range for the chosen
        filter type.

    See Also
    --------
    band, bandpass, bandreject, highpass, lowpass

    '''
    filter_types = ['high', 'low', 'pass', 'reject']
    if filter_type not in filter_types:
        raise ValueError(
            "filter_type must be one of {}".format(', '.join(filter_types))
        )

    one_sided = filter_type in ['high', 'low']
    two_sided = filter_type in ['pass', 'reject']

    # ---- cutoff_freq -----------------------------------------------------
    cutoff_is_list = isinstance(cutoff_freq, list)
    if not (is_number(cutoff_freq) or cutoff_is_list):
        raise ValueError("cutoff_freq must be a number or a list")

    if one_sided and cutoff_is_list:
        raise ValueError(
            "For filter types 'high' and 'low', "
            "cutoff_freq must be a float, not a list"
        )

    if two_sided and is_number(cutoff_freq):
        raise ValueError(
            "For filter types 'pass' and 'reject', "
            "cutoff_freq must be a list, not a float"
        )

    if is_number(cutoff_freq) and cutoff_freq <= 0:
        raise ValueError("cutoff_freq must be a postive number")

    if cutoff_is_list:
        if len(cutoff_freq) != 2:
            raise ValueError(
                "If cutoff_freq is a list it may only have 2 elements."
            )
        if any([not is_number(f) or f <= 0 for f in cutoff_freq]):
            raise ValueError(
                "elements of cutoff_freq must be positive numbers"
            )
        # Work on an ascending copy so index 0 is the lower frequency.
        cutoff_freq = sorted(cutoff_freq)

    # ---- stop_band_attenuation -------------------------------------------
    if not is_number(stop_band_attenuation) or stop_band_attenuation < 0:
        raise ValueError("stop_band_attenuation must be a positive number")

    # ---- transition_bw ---------------------------------------------------
    bw_is_list = isinstance(transition_bw, list)
    if not (is_number(transition_bw) or bw_is_list
            or transition_bw is None):
        raise ValueError("transition_bw must be a number, a list or None.")

    if one_sided and bw_is_list:
        raise ValueError(
            "For filter types 'high' and 'low', "
            "transition_bw must be a float, not a list"
        )

    if is_number(transition_bw) and transition_bw <= 0:
        raise ValueError("transition_bw must be a postive number")

    if bw_is_list:
        if any([not is_number(f) or f <= 0 for f in transition_bw]):
            raise ValueError(
                "elements of transition_bw must be positive numbers"
            )
        if len(transition_bw) != 2:
            raise ValueError(
                "If transition_bw is a list it may only have 2 elements."
            )

    # ---- phase_response --------------------------------------------------
    if phase_response is not None and not is_number(phase_response):
        raise ValueError("phase_response must be a number or None.")

    if (is_number(phase_response) and
            (phase_response < 0 or phase_response > 100)):
        raise ValueError("phase response must be between 0 and 100")

    # ---- argument assembly -----------------------------------------------
    fmt = '{:f}'.format
    effect_args = ['sinc', '-a', fmt(stop_band_attenuation)]

    if phase_response is not None:
        effect_args += ['-p', fmt(phase_response)]

    if filter_type == 'high':
        # High-pass: the bandwidth option precedes the frequency.
        if transition_bw is not None:
            effect_args += ['-t', fmt(transition_bw)]
        effect_args.append(fmt(cutoff_freq))
    elif filter_type == 'low':
        # Low-pass: dash-prefixed frequency, bandwidth option after it.
        effect_args.append('-' + fmt(cutoff_freq))
        if transition_bw is not None:
            effect_args += ['-t', fmt(transition_bw)]
    else:
        # Band filters: an optional bandwidth before the frequency pair
        # and, for a two-element list, a second one after it.
        if is_number(transition_bw):
            effect_args += ['-t', fmt(transition_bw)]
        elif bw_is_list:
            effect_args += ['-t', fmt(transition_bw[0])]

        low_edge, high_edge = cutoff_freq[0], cutoff_freq[1]
        if filter_type == 'pass':
            effect_args.append('{:f}-{:f}'.format(low_edge, high_edge))
        elif filter_type == 'reject':
            # Band-reject is written with the higher frequency first.
            effect_args.append('{:f}-{:f}'.format(high_edge, low_edge))

        if bw_is_list:
            effect_args += ['-t', fmt(transition_bw[1])]

    self.effects.extend(effect_args)
    self.effects_log.append('sinc')
    return self
def speed(self, factor: float):
    '''Adjust the audio speed (pitch and tempo together).

    Technically, the speed effect only changes the sample rate information,
    leaving the samples themselves untouched. The rate effect is invoked
    automatically to resample to the output sample rate, using its default
    quality/speed. For higher quality or higher speed resampling, in
    addition to the speed effect, specify the rate effect with the desired
    quality option.

    Parameters
    ----------
    factor : float
        The ratio of the new speed to the old speed.
        For ex. 1.1 speeds up the audio by 10%; 0.9 slows it down by 10%.
        A warning is logged for factors below 0.5 or above 2.

    Raises
    ------
    ValueError
        If `factor` is not a positive number.

    See Also
    --------
    rate, tempo, pitch

    '''
    if not is_number(factor) or factor <= 0:
        raise ValueError("factor must be a positive number")

    is_extreme = factor < 0.5 or factor > 2
    if is_extreme:
        logger.warning(
            "Using an extreme factor. Quality of results will be poor"
        )

    self.effects.extend(['speed', '{:f}'.format(factor)])
    self.effects_log.append('speed')
    return self
def stat(self,
         input_filepath: Union[str, Path],
         scale: Optional[float] = None,
         rms: Optional[bool] = False):
    '''Display time and frequency domain statistical information about the
    audio. Audio is passed unmodified through the SoX processing chain.

    Unlike other Transformer methods, this does not modify the transformer
    effects chain. Instead it computes statistics on the output file that
    would be created if the build command were invoked.

    Note: The file is downmixed to mono prior to computation.

    Parameters
    ----------
    input_filepath : str
        Path to input file to compute stats on.
    scale : float or None, default=None
        If not None, scales the input by the given scale factor.
    rms : bool, default=False
        If True, scales all values by the average rms amplitude.

    Returns
    -------
    stat_dict : dict
        Dictionary of statistics. Keys are the labels of the output lines
        (with surrounding colons stripped); values are the last
        whitespace-separated token of each line, kept as strings.

    Raises
    ------
    ValueError
        If `scale` is given but is not a positive number.

    See Also
    --------
    stats, power_spectrum, sox.file_info

    '''
    effect_args = ['channels', '1', 'stat']

    if scale is not None:
        if not is_number(scale) or scale <= 0:
            raise ValueError("scale must be a positive number.")
        effect_args += ['-s', '{:f}'.format(scale)]

    if rms:
        effect_args.append('-rms')

    # Only the third element of build()'s return value is parsed.
    _status, _unused, report = self.build(
        input_filepath, '-n', extra_args=effect_args, return_output=True
    )

    stat_dict = {}
    for raw_line in report.split('\n'):
        tokens = raw_line.split()
        if tokens:
            # Last token is the value; everything before it is the label.
            *label_parts, value = tokens
            stat_dict[' '.join(label_parts).strip(':')] = value

    return stat_dict
def power_spectrum(self, input_filepath: Union[str, Path]):
    '''Calculates the power spectrum (4096 point DFT). This method
    internally invokes the stat command with the -freq option.

    Note: The file is downmixed to mono prior to computation.

    Parameters
    ----------
    input_filepath : str
        Path to input file to compute stats on.

    Returns
    -------
    power_spectrum : list
        List of frequency (Hz), amplitude pairs. Each pair is a
        two-element list of floats, in the order the lines were reported.

    See Also
    --------
    stat, stats, sox.file_info

    '''
    _status, _unused, report = self.build(
        input_filepath, '-n',
        extra_args=['channels', '1', 'stat', '-freq'],
        return_output=True
    )

    # Only lines made of exactly two tokens are treated as spectrum rows;
    # every other line is ignored.
    rows = [line.split() for line in report.split('\n')]
    return [[float(row[0]), float(row[1])] for row in rows if len(row) == 2]
def stats(self, input_filepath: Union[str, Path]):
    '''Display time domain statistical information about the audio
    channels. Audio is passed unmodified through the SoX processing chain.

    Unlike other Transformer methods, this does not modify the transformer
    effects chain. Instead it computes statistics on the output file that
    would be created if the build command were invoked.

    Note: The file is downmixed to mono prior to computation.

    Parameters
    ----------
    input_filepath : str
        Path to input file to compute stats on.

    Returns
    -------
    stats_dict : dict
        Dictionary of statistics. For every non-empty output line the last
        whitespace-separated token is the value and the preceding tokens,
        joined by single spaces, form the key. Values are kept as strings.

    See Also
    --------
    stat, sox.file_info

    '''
    _status, _unused, report = self.build(
        input_filepath, '-n',
        extra_args=['channels', '1', 'stats'],
        return_output=True
    )

    stats_dict = {}
    for raw_line in report.split('\n'):
        tokens = raw_line.split()
        if not tokens:
            continue
        *label_parts, value = tokens
        stats_dict[' '.join(label_parts)] = value

    return stats_dict
def stretch(self, factor: float, window: float = 20):
    '''Change the audio duration (but not its pitch).
    **Unless factor is close to 1, use the tempo effect instead.**

    This effect is broadly equivalent to the tempo effect with search set
    to zero, so in general, its results are comparatively poor; it is
    retained as it can sometimes out-perform tempo for small factors.

    Parameters
    ----------
    factor : float
        Stretch factor; must be a positive number. The value is forwarded
        to the SoX stretch effect exactly as given.
        NOTE(review): this parameter has been documented as the ratio of
        the new tempo to the old tempo (1.1 = 10% faster), i.e. the inverse
        of SoX's own stretch factor, but no inversion is applied here.
        Confirm which meaning is intended before relying on it.
    window : float, default=20
        Window size in milliseconds

    Raises
    ------
    ValueError
        If `factor` or `window` is not a positive number.

    See Also
    --------
    tempo, speed, pitch

    '''
    if not is_number(factor) or factor <= 0:
        raise ValueError("factor must be a positive number")

    # Two independent advisories: one for very large changes, one for any
    # change of more than 10% either way.
    is_extreme = factor < 0.5 or factor > 2
    if is_extreme:
        logger.warning(
            "Using an extreme time stretching factor. "
            "Quality of results will be poor"
        )

    far_from_unity = abs(factor - 1.0) > 0.1
    if far_from_unity:
        logger.warning(
            "For this stretch factor, "
            "the tempo effect has better performance."
        )

    if not is_number(window) or window <= 0:
        raise ValueError(
            "window must be a positive number."
        )

    self.effects.extend(
        ['stretch'] + ['{:f}'.format(value) for value in (factor, window)]
    )
    self.effects_log.append('stretch')
    return self
def swap(self):
    '''Swap stereo channels.

    If the input is not stereo, pairs of channels are swapped, and a
    possible odd last channel passed through.

    E.g., for seven channels, the output order will be 2, 1, 4, 3, 6, 5, 7.

    See Also
    ----------
    remix

    '''
    name = 'swap'
    self.effects.append(name)
    self.effects_log.append(name)
    return self
def tempo(self, factor: float,
          audio_type: Optional[Literal['m', 's', 'l']] = None,
          quick: bool = False):
    '''Time stretch audio without changing pitch.

    This effect uses the WSOLA algorithm. The audio is chopped up into
    segments which are then shifted in the time domain and overlapped
    (cross-faded) at points where their waveforms are most similar as
    determined by measurement of least squares.

    Parameters
    ----------
    factor : float
        The ratio of new tempo to the old tempo.
        For ex. 1.1 speeds up the tempo by 10%; 0.9 slows it down by 10%.
    audio_type : str
        Type of audio, which optimizes algorithm parameters. One of:
         * m : Music,
         * s : Speech,
         * l : Linear (useful when factor is close to 1),
    quick : bool, default=False
        If True, this effect will run faster but with lower sound quality.

    Raises
    ------
    ValueError
        If `factor` is not a positive number, `audio_type` is not one of
        the accepted values, or `quick` is not a boolean.

    See Also
    --------
    stretch, speed, pitch

    '''
    if not is_number(factor) or factor <= 0:
        raise ValueError("factor must be a positive number")

    # Advisory warnings only; neither stops the effect from being queued.
    is_extreme = factor < 0.5 or factor > 2
    if is_extreme:
        logger.warning(
            "Using an extreme time stretching factor. "
            "Quality of results will be poor"
        )

    near_unity = abs(factor - 1.0) <= 0.1
    if near_unity:
        logger.warning(
            "For this stretch factor, "
            "the stretch effect has better performance."
        )

    if audio_type not in [None, 'm', 's', 'l']:
        raise ValueError(
            "audio_type must be one of None, 'm', 's', or 'l'."
        )

    if not isinstance(quick, bool):
        raise ValueError("quick must be a boolean.")

    # Flags go first (quick, then audio type); the factor comes last.
    effect_args = ['tempo', '-q'] if quick else ['tempo']
    if audio_type is not None:
        effect_args.append('-{}'.format(audio_type))
    effect_args.append('{:f}'.format(factor))

    self.effects.extend(effect_args)
    self.effects_log.append('tempo')
    return self
def treble(self, gain_db: float,
           frequency: float = 3000.0,
           slope: float = 0.5):
    '''Boost or cut the treble (upper) frequencies of the audio using a
    two-pole shelving filter with a response similar to that of a standard
    hi-fi’s tone-controls. This is also known as shelving equalisation.

    The filters are described in detail in
    http://musicdsp.org/files/Audio-EQ-Cookbook.txt

    Parameters
    ----------
    gain_db : float
        The gain at the Nyquist frequency.
        For a large cut use -20, for a large boost use 20.
    frequency : float, default=3000.0
        The filter's cutoff frequency in Hz.
    slope : float, default=0.5
        The steepness of the filter's shelf transition.
        For a gentle slope use 0.3, and use 1.0 for a steep slope.
        Must be greater than 0 and at most 1.

    Raises
    ------
    ValueError
        If `gain_db` is not a number, `frequency` is not a positive
        number, or `slope` is not a number in (0, 1].

    See Also
    --------
    bass, equalizer

    '''
    if not is_number(gain_db):
        raise ValueError("gain_db must be a number")

    if not is_number(frequency) or frequency <= 0:
        raise ValueError("frequency must be a positive number.")

    if not is_number(slope) or slope <= 0 or slope > 1.0:
        # Report the parameter and range that are actually checked.
        raise ValueError(
            "slope must be a number greater than 0 and at most 1."
        )

    effect_args = [
        'treble', '{:f}'.format(gain_db), '{:f}'.format(frequency),
        # The slope value is passed with a trailing 's'.
        '{:f}s'.format(slope)
    ]

    self.effects.extend(effect_args)
    self.effects_log.append('treble')

    return self
def tremolo(self, speed: float = 6.0, depth: float = 40.0):
    '''Apply a tremolo (low frequency amplitude modulation) effect to the
    audio. The tremolo frequency in Hz is given by speed, and the depth
    as a percentage by depth (default 40).

    Parameters
    ----------
    speed : float, default=6.0
        Tremolo speed in Hz.
    depth : float, default=40.0
        Tremolo depth as a percentage of the total amplitude.

    Returns
    -------
    self
        The transformer itself, so that calls can be chained.

    Raises
    ------
    ValueError
        If speed is not a number greater than 0, or if depth is not a
        number greater than 0 and at most 100.

    See Also
    --------
    flanger

    Examples
    --------
    >>> tfm = sox.Transformer()

    For a growl-type effect

    >>> tfm.tremolo(speed=100.0)

    '''
    speed_rejected = not is_number(speed) or speed <= 0
    if speed_rejected:
        raise ValueError("speed must be a positive number.")

    depth_rejected = not is_number(depth) or depth <= 0 or depth > 100
    if depth_rejected:
        raise ValueError("depth must be a positive number less than 100.")

    # Both parameters are rendered as fixed-point text, speed first.
    rendered = ['{:f}'.format(setting) for setting in (speed, depth)]

    self.effects.extend(['tremolo'] + rendered)
    self.effects_log.append('tremolo')

    return self
def trim(self, start_time: float, end_time: Optional[float] = None):
    '''Excerpt a clip from an audio file, given the start timestamp and
    end timestamp of the clip within the file, expressed in seconds. If
    the end timestamp is set to `None` or left unspecified, it defaults to
    the duration of the audio file.

    Parameters
    ----------
    start_time : float
        Start time of the clip (seconds). Must not be negative.
    end_time : float or None, default=None
        End time of the clip (seconds). When given, it must not be
        negative and must be greater than start_time.

    Returns
    -------
    self
        The transformer itself, so that calls can be chained.

    Raises
    ------
    ValueError
        If start_time or end_time is not a number, is negative, or if
        start_time is not smaller than end_time.

    '''
    if not is_number(start_time) or start_time < 0:
        raise ValueError("start_time must be a positive number.")

    clip_args = ['trim', '{:f}'.format(start_time)]
    bounded = end_time is not None

    if bounded and (not is_number(end_time) or end_time < 0):
        raise ValueError("end_time must be a positive number.")

    if bounded and start_time >= end_time:
        raise ValueError("start_time must be smaller than end_time.")

    if bounded:
        # The second argument is the clip length (end - start), not the
        # end timestamp itself.
        clip_length = end_time - start_time
        clip_args.append('{:f}'.format(clip_length))

    self.effects.extend(clip_args)
    self.effects_log.append('trim')

    return self
def upsample(self, factor: int = 2):
    '''Upsample the signal by an integer factor: zero-value samples are
    inserted between each pair of input samples. As a result, the original
    spectrum is replicated into the new frequency space (imaging) and
    attenuated. The upsample effect is typically used in combination with
    filtering effects.

    Parameters
    ----------
    factor : int, default=2
        Integer upsampling factor. Must be at least 1. Booleans are
        rejected even though `bool` is a subclass of `int`.

    Returns
    -------
    self
        The transformer itself, so that calls can be chained.

    Raises
    ------
    ValueError
        If factor is not an integer greater than or equal to 1.

    See Also
    --------
    rate, downsample

    '''
    # bool passes isinstance(..., int); without the explicit rejection,
    # True would be accepted and rendered as the text 'True' in the
    # effect arguments instead of a number.
    is_plain_int = isinstance(factor, int) and not isinstance(factor, bool)
    if not is_plain_int or factor < 1:
        raise ValueError('factor must be a positive integer.')

    effect_args = ['upsample', '{}'.format(factor)]

    self.effects.extend(effect_args)
    self.effects_log.append('upsample')

    return self
def vad(self,
        location: Literal[1, -1] = 1,
        normalize: bool = True,
        activity_threshold: float = 7.0,
        min_activity_duration: float = 0.25,
        initial_search_buffer: float = 1.0,
        max_gap: float = 0.25,
        initial_pad: float = 0.0):
    '''Voice Activity Detector. Attempts to trim silence and quiet
    background sounds from the ends of recordings of speech. The algorithm
    currently uses a simple cepstral power measurement to detect voice, so
    may be fooled by other things, especially music.

    The effect can trim only from the front of the audio, so in order to
    trim from the back, the reverse effect must also be used.

    Parameters
    ----------
    location : 1 or -1, default=1
        If 1, trims silence from the beginning
        If -1, trims silence from the end
    normalize : bool, default=True
        If true, normalizes audio before processing.
    activity_threshold : float, default=7.0
        The measurement level used to trigger activity detection. This may
        need to be changed depending on the noise level, signal level, and
        other characteristics of the input audio.
    min_activity_duration : float, default=0.25
        The time constant (in seconds) used to help ignore short bursts of
        sound.
    initial_search_buffer : float, default=1.0
        The amount of audio (in seconds) to search for quieter/shorter
        bursts of audio to include prior to the detected trigger point.
    max_gap : float, default=0.25
        The allowed gap (in seconds) between quieter/shorter bursts of
        audio to include prior to the detected trigger point
    initial_pad : float, default=0.0
        The amount of audio (in seconds) to preserve before the trigger
        point and any found quieter/shorter bursts.

    Returns
    -------
    self
        The transformer itself, so that calls can be chained.

    Raises
    ------
    ValueError
        If location is not -1 or 1, normalize is not a bool,
        activity_threshold is not a number, or any of the duration
        parameters is not a number or is negative.

    See Also
    --------
    silence

    Examples
    --------
    >>> tfm = sox.Transformer()

    Remove silence from the beginning of speech

    >>> tfm.vad(initial_pad=0.3)

    Remove silence from the end of speech

    >>> tfm.vad(location=-1, initial_pad=0.2)

    '''
    if location not in [-1, 1]:
        raise ValueError("location must be -1 or 1.")

    if not isinstance(normalize, bool):
        raise ValueError("normalize muse be a boolean.")

    if not is_number(activity_threshold):
        raise ValueError("activity_threshold must be a number.")

    # Each of these must be numeric and not negative; they are checked in
    # this order and the first failure is reported.
    non_negative_checks = (
        (min_activity_duration,
         "min_activity_duration must be a positive number"),
        (initial_search_buffer,
         "initial_search_buffer must be a positive number"),
        (max_gap, "max_gap must be a positive number."),
        (initial_pad, "initial_pad must be a positive number."),
    )
    for seconds, message in non_negative_checks:
        if not is_number(seconds) or seconds < 0:
            raise ValueError(message)

    detector_args = ['vad']
    flagged_values = (
        ('-t', activity_threshold),
        ('-T', min_activity_duration),
        ('-s', initial_search_buffer),
        ('-g', max_gap),
        ('-p', initial_pad),
    )
    for flag, value in flagged_values:
        detector_args.extend([flag, '{:f}'.format(value)])

    # Trimming from the end is done by reversing, running the detector,
    # and reversing back.
    flip = ['reverse'] if location == -1 else []
    prelude = ['norm'] if normalize else []

    self.effects.extend(prelude + flip + detector_args + flip)
    self.effects_log.append('vad')

    return self
def vol(self, gain: float,
        gain_type: Literal['amplitude', 'power', 'db'] = 'amplitude',
        limiter_gain: Optional[float] = None):
    '''Apply an amplification or an attenuation to the audio signal.

    Parameters
    ----------
    gain : float
        Interpreted according to the given `gain_type`.
        If `gain_type` = 'amplitude', `gain` is a positive amplitude ratio.
        If `gain_type` = 'power', `gain` is a power (voltage squared).
        If `gain_type` = 'db', `gain` is in decibels.
    gain_type : string, default='amplitude'
        Type of gain. One of:
            - 'amplitude'
            - 'power'
            - 'db'
    limiter_gain : float or None, default=None
        If specified, a limiter is invoked on peaks greater than
        `limiter_gain` to prevent clipping.
        `limiter_gain` should be a positive value much less than 1.
        It is only passed on when the gain amplifies the signal
        (gain > 1 for 'amplitude'/'power', gain > 0 for 'db').

    Returns
    -------
    self
        The transformer itself, so that calls can be chained.

    Raises
    ------
    ValueError
        If gain is not a number, limiter_gain is not a number strictly
        between 0 and 1, gain is negative for an 'amplitude' or 'power'
        gain_type, or gain_type is not one of the accepted values.

    See Also
    --------
    gain, compand

    '''
    if not is_number(gain):
        raise ValueError('gain must be a number.')

    limiter_requested = limiter_gain is not None
    if limiter_requested and (
            not is_number(limiter_gain) or
            limiter_gain <= 0 or limiter_gain >= 1):
        raise ValueError(
            'limiter gain must be a positive number less than 1'
        )

    linear_gain = gain_type in ['amplitude', 'power']
    if linear_gain and gain < 0:
        raise ValueError(
            "If gain_type = amplitude or power, gain must be positive."
        )

    effect_args = ['vol', '{:f}'.format(gain)]

    # Map the accepted gain_type values to the token appended to the
    # effect arguments; note that 'db' is emitted as 'dB'.
    type_tokens = (
        ('amplitude', 'amplitude'),
        ('power', 'power'),
        ('db', 'dB'),
    )
    for accepted, token in type_tokens:
        if gain_type == accepted:
            effect_args.append(token)
            break
    else:
        raise ValueError('gain_type must be one of amplitude power or db')

    if limiter_requested:
        # The limiter argument is added only when the gain amplifies.
        amplifies = gain > 1 if linear_gain else gain > 0
        if amplifies:
            effect_args.append('{:f}'.format(limiter_gain))

    self.effects.extend(effect_args)
    self.effects_log.append('vol')

    return self