##
## create.py
##
## create.py contains all the user-facing functions associated with GOOSE.
## This module provides high-level interfaces for generating sequences and variants
## with specific properties. If a new function is added it should be included here
## and added to the __all__ list.
##
# Public API of create.py. Every user-facing function defined in this module
# must also be listed here, otherwise it will not be exported.
__all__ = [
    'sequence',
    'seq_by_fractions',
    'seq_by_classes',
    'seq_by_re',
    'seq_by_rg',
    'variant',
    'seq_fractions',
]
# Import required modules and functions
from goose import goose_exceptions
from goose.backend_sequence_generation import sequence_generation
from goose.backend_variant_generation.variant_generator import VariantGenerator
from goose.backend import goose_tools
from goose.backend import parameters
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ Create \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ sequence \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ By \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ Specifying \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ Properties \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
[docs]
def sequence(length, **kwargs):
    """
    Generate a disordered sequence with specified physicochemical properties.

    This is the main entry point for creating intrinsically disordered
    sequences. Any combination of FCR, NCPR, hydropathy and kappa may be
    requested together.

    Keyword arguments that are not supplied are filled in by
    ``goose_tools.check_and_correct_props_kwargs``; the default values quoted
    below are the documented ones and are not set in this function.

    Parameters
    ----------
    length : int
        Length of the desired disordered sequence. Checked with
        ``goose_tools.length_check``.
    FCR : float, optional
        Fraction of charged residues (between 0 and 1). This includes both
        positively and negatively charged residues.
    NCPR : float, optional
        Net charge per residue (between -1 and 1). Positive values indicate
        net positive charge, negative values indicate net negative charge.
    hydropathy : float, optional
        Mean hydropathy of the sequence (between 0 and 6.1). Higher values
        indicate more hydrophobic sequences.
    kappa : float, optional
        Kappa value describing charge patterning (between 0 and 1). In the
        standard (Das & Pappu) definition, values near 0 correspond to
        well-mixed opposite charges and values near 1 to segregated charges.
    attempts : int, optional
        Number of attempts to generate the sequence. Documented default is 20.
    disorder_cutoff : float, optional
        Disorder threshold used to validate generated sequences.
    exclude : list, optional
        Amino acid residues to exclude from sequence generation.
        Documented restriction: charged residues cannot be excluded when
        FCR is specified.
    use_weighted_probabilities : bool, optional
        Whether to use weighted amino acid probabilities. Documented default
        is False.
    strict_disorder : bool, optional
        If True, all residues must be above the disorder threshold.
        Documented default is False.
    return_all_sequences : bool, optional
        Whether to return all generated sequences instead of a single one.
        Documented default is False.
    custom_probabilities : dict or str, optional
        Custom amino acid probabilities. Either a dict keyed by single-letter
        amino acid codes, or one of the organism names from
        idr_probabilities.py:
        'mouse', 'fly', 'neurospora', 'yeast', 'arabidopsis', 'e_coli',
        'worm', 'zebrafish', 'frog', 'dictyostelium', 'human', 'unbiased',
        'all'. A non-None value is normalised through
        ``goose_tools.handle_custom_probabilities``.
    metapredict_version : int, optional
        Version of MetaPredict to use for disorder prediction. Documented
        default is 3.
    max_consecutive_ordered : int, optional
        Maximum number of consecutive ordered residues allowed.
    max_total_ordered : float, optional
        Maximum fraction of ordered residues allowed.
    batch_size : int, optional
        Number of sequences to generate in each batch.
    hydropathy_tolerance : float, optional
        Tolerance for hydropathy matching.
    kappa_tolerance : float, optional
        Tolerance for kappa matching.
    hydrophobicity : float, optional
        Legacy name for ``hydropathy``. If given, its value replaces any
        ``hydropathy`` value.
    cutoff : float, optional
        Legacy name for ``disorder_cutoff``. If given, its value replaces any
        ``disorder_cutoff`` value.

    Returns
    -------
    str or list
        Whatever ``sequence_generation.by_properties`` returns: documented as
        a single sequence string, or a list of sequences when
        return_all_sequences is True.

    Raises
    ------
    GooseInputError
        Documented as raised for invalid parameters; the checks themselves
        live in ``goose_tools``.
    GooseFail
        If the backend returns None, i.e. no sequence could be generated.

    Examples
    --------
    >>> # Generate a 100-residue sequence with specific properties
    >>> seq = sequence(100, FCR=0.3, NCPR=0.1, hydropathy=3.0)
    >>>
    >>> # Generate sequence excluding certain residues
    >>> seq = sequence(50, exclude=['C', 'M'])
    """
    # Reject unacceptable lengths before doing anything else.
    goose_tools.length_check(length)

    # Normalise user-supplied probabilities (organism name or dict).
    if kwargs.get('custom_probabilities') is not None:
        kwargs['custom_probabilities'] = goose_tools.handle_custom_probabilities(
            kwargs['custom_probabilities'])

    # Map legacy keyword names onto their current equivalents. The legacy
    # value wins if both spellings were passed.
    legacy_aliases = (('hydrophobicity', 'hydropathy'),
                      ('cutoff', 'disorder_cutoff'))
    for legacy_name, current_name in legacy_aliases:
        if legacy_name in kwargs:
            kwargs[current_name] = kwargs.pop(legacy_name)

    # Only these keywords are accepted; anything else is handed to
    # check_valid_kwargs to report.
    accepted_kwargs = ['FCR','NCPR', 'hydropathy', 'kappa', 'disorder_cutoff',
                       'attempts', 'exclude', 'use_weighted_probabilities',
                       'strict_disorder', 'return_all_sequences', 'custom_probabilities',
                       'metapredict_version', 'max_consecutive_ordered',
                       'max_total_ordered', 'batch_size',
                       'hydropathy_tolerance', 'kappa_tolerance']
    goose_tools.check_valid_kwargs(kwargs, accepted_kwargs)

    # Fill in defaults / corrected values first; every lookup below relies on
    # the keys being present in the returned dict.
    kwargs = goose_tools.check_and_correct_props_kwargs(**kwargs)

    # Property-specific validation for this length.
    goose_tools.check_props_parameters(length, **kwargs)

    # Validation of the parameters shared by all generator functions.
    basic_parameters = {
        'num_attempts': kwargs['attempts'],
        'strict_disorder': kwargs['strict_disorder'],
        'disorder_cutoff': kwargs['disorder_cutoff'],
        'metapredict_version': kwargs['metapredict_version'],
        'return_all_sequences': kwargs['return_all_sequences'],
        'use_weighted_probabilities': kwargs['use_weighted_probabilities'],
        'max_consecutive_ordered': kwargs['max_consecutive_ordered'],
        'max_total_ordered': kwargs['max_total_ordered'],
        'batch_size': kwargs['batch_size'],
        'custom_probabilities': kwargs['custom_probabilities'],
        'exclude': kwargs['exclude'],
    }
    goose_tools.check_basic_parameters(**basic_parameters)

    # Translate the user-facing keyword names into the backend's names.
    backend_arguments = {
        'fcr': kwargs['FCR'],
        'ncpr': kwargs['NCPR'],
        'hydropathy': kwargs['hydropathy'],
        'kappa': kwargs['kappa'],
        'exclude_residues': kwargs['exclude'],
        'num_attempts': kwargs['attempts'],
        'strict_disorder': kwargs['strict_disorder'],
        'disorder_cutoff': kwargs['disorder_cutoff'],
        'metapredict_version': kwargs['metapredict_version'],
        'return_all_sequences': kwargs['return_all_sequences'],
        'use_weighted_probabilities': kwargs['use_weighted_probabilities'],
        'chosen_probabilities': kwargs['custom_probabilities'],
        'max_consecutive_ordered': kwargs['max_consecutive_ordered'],
        'max_total_ordered': kwargs['max_total_ordered'],
        'batch_size': kwargs['batch_size'],
        'hydropathy_tolerance': kwargs['hydropathy_tolerance'],
        'kappa_tolerance': kwargs['kappa_tolerance'],
    }
    generated_seq = sequence_generation.by_properties(length, **backend_arguments)

    # A None result from the backend means generation failed.
    if generated_seq is not None:
        return generated_seq
    raise goose_exceptions.GooseFail('Unable to generate sequence. Please try again with different parameters or a lower cutoff value.')
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ Create \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ sequence \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ By \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ Specifying \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ Fractions \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
[docs]
def seq_by_fractions(length, **kwargs):
    """
    Generate a disordered sequence with specified amino acid fractions.

    Creates an intrinsically disordered sequence in which the fraction of
    individual amino acids can be fixed, giving fine-grained control over
    composition.

    Keyword arguments that are not supplied are filled in by
    ``goose_tools.check_and_correct_fracs_kwargs``; the default values quoted
    below are the documented ones and are not set in this function.

    Parameters
    ----------
    length : int
        Length of the desired disordered sequence. Checked with
        ``goose_tools.length_check``.
    A, C, D, E, F, G, H, I, K, L, M, N, P, Q, R, S, T, V, W, Y : float, optional
        Fraction of the sequence that should be made up of each specific
        amino acid (e.g., A=0.2, Y=0.05). Values should be between 0 and 1.
        The sum of all specified fractions should not exceed 1.
    max_aa_fractions : dict, optional
        Overrides for the maximum allowed fraction of any amino acid. Keys
        are single-letter amino acid codes, values floats between 0 and 1.
        It is passed to the ``goose_tools`` validators with the other
        keyword arguments; it is not forwarded to the backend generator.
    disorder_cutoff : float, optional
        Disorder threshold for sequence validation. Documented default is 0.6.
    attempts : int, optional
        Number of attempts to generate the sequence. Documented default is 100.
    strict_disorder : bool, optional
        If True, all residues must be above the disorder threshold.
        Documented default is False.
    remaining_probabilities : dict or str, optional
        Custom amino acid probabilities. Either a dict keyed by single-letter
        amino acid codes, or one of the organism names from
        idr_probabilities.py:
        'mouse', 'fly', 'neurospora', 'yeast', 'arabidopsis', 'e_coli',
        'worm', 'zebrafish', 'frog', 'dictyostelium', 'human', 'unbiased',
        'all'. A non-None value is normalised through
        ``goose_tools.handle_custom_probabilities``.
    return_all_sequences : bool, optional
        Whether to return all generated sequences. Documented default is False.
    metapredict_version : int, optional
        Version of MetaPredict to use for disorder prediction. Documented
        default is 3.
    max_consecutive_ordered : int, optional
        Maximum number of consecutive ordered residues allowed.
    max_total_ordered : float, optional
        Maximum fraction of ordered residues allowed.
    batch_size : int, optional
        Number of sequences to generate in each batch.
    cutoff : float, optional
        Legacy name for ``disorder_cutoff``. If given, its value replaces any
        ``disorder_cutoff`` value.

    Returns
    -------
    str or list
        Whatever ``sequence_generation.by_fractions`` returns: documented as
        a single sequence string, or a list of sequences when
        return_all_sequences is True.

    Raises
    ------
    GooseInputError
        Documented as raised for invalid parameters; the checks themselves
        live in ``goose_tools``.
    GooseFail
        Documented as raised when generation fails after all attempts. It is
        not raised in this function, so it has to come from the backend
        (not verified here).

    Examples
    --------
    >>> # Generate sequence with 30% alanine and 10% glycine
    >>> seq = seq_by_fractions(100, A=0.3, G=0.1)
    >>>
    >>> # Generate sequence with custom max fractions
    >>> seq = seq_by_fractions(50, A=0.4, max_aa_fractions={'A': 0.5})
    """
    # Reject unacceptable lengths before doing anything else.
    goose_tools.length_check(length)

    # Normalise user-supplied probabilities (organism name or dict).
    if kwargs.get('remaining_probabilities') is not None:
        kwargs['remaining_probabilities'] = goose_tools.handle_custom_probabilities(
            kwargs['remaining_probabilities'])

    # Legacy keyword: 'cutoff' is the old spelling of 'disorder_cutoff' and
    # wins if both were passed.
    if 'cutoff' in kwargs:
        kwargs['disorder_cutoff'] = kwargs.pop('cutoff')

    # Only these keywords are accepted; anything else is handed to
    # check_valid_kwargs to report.
    accepted_kwargs = ['remaining_probabilities', 'attempts',
                       'strict_disorder', 'disorder_cutoff', 'max_consecutive_ordered',
                       'max_total_ordered','max_aa_fractions',
                       'A','C','D','E','F','G','H','I','K','L','M','N',
                       'P','Q','R','S','T','V','W','Y',
                       'return_all_sequences', 'metapredict_version',
                       'batch_size']
    goose_tools.check_valid_kwargs(kwargs, accepted_kwargs)

    # Fill in defaults / corrected values first; every lookup below relies on
    # the keys being present in the returned dict.
    kwargs = goose_tools.check_and_correct_fracs_kwargs(**kwargs)

    # Collect the per-residue fractions present in the corrected kwargs.
    amino_acids = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']
    fractions = {name: value for name, value in kwargs.items() if name in amino_acids}

    # Fraction-specific validation.
    goose_tools.check_fracs_parameters(**kwargs)

    # Validation of the parameters shared by all generator functions.
    basic_parameters = {
        'num_attempts': kwargs['attempts'],
        'strict_disorder': kwargs['strict_disorder'],
        'disorder_cutoff': kwargs['disorder_cutoff'],
        'metapredict_version': kwargs['metapredict_version'],
        'return_all_sequences': kwargs['return_all_sequences'],
        'max_consecutive_ordered': kwargs['max_consecutive_ordered'],
        'max_total_ordered': kwargs['max_total_ordered'],
        'batch_size': kwargs['batch_size'],
        'custom_probabilities': kwargs['remaining_probabilities'],
    }
    goose_tools.check_basic_parameters(**basic_parameters)

    # Translate the user-facing keyword names into the backend's names.
    backend_arguments = {
        'fractions': fractions,
        'remaining_probabilities': kwargs['remaining_probabilities'],
        'num_attempts': kwargs['attempts'],
        'strict_disorder': kwargs['strict_disorder'],
        'disorder_cutoff': kwargs['disorder_cutoff'],
        'max_consecutive_ordered': kwargs['max_consecutive_ordered'],
        'max_total_ordered': kwargs['max_total_ordered'],
        'metapredict_version': kwargs['metapredict_version'],
        'return_all_sequences': kwargs['return_all_sequences'],
        'batch_size': kwargs['batch_size'],
    }

    # The backend result is returned as-is; no None check is performed here
    # (the original author notes that the backend handles failure itself).
    return sequence_generation.by_fractions(length, **backend_arguments)
[docs]
def seq_fractions(length, **kwargs):
    """
    Backwards-compatible alias for ``seq_by_fractions``.

    Kept so that code written against the old function name keeps working.
    New code should call ``seq_by_fractions`` directly.

    Parameters
    ----------
    length : int
        Length of the desired disordered sequence.
    **kwargs : dict
        Forwarded unchanged to ``seq_by_fractions``; see that function for
        the accepted keywords.

    Returns
    -------
    str or list
        Exactly what ``seq_by_fractions`` returns for the same arguments.

    See Also
    --------
    seq_by_fractions : The main function for generating sequences by fractions.

    Examples
    --------
    >>> # Generate sequence with 30% alanine and 10% glycine
    >>> seq = seq_fractions(100, A=0.3, G=0.1)
    """
    # Pure pass-through: no argument is inspected or modified here.
    generated_seq = seq_by_fractions(length, **kwargs)
    return generated_seq
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ Create \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ sequence \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ By \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ classes \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
[docs]
def seq_by_classes(length: int,
                   aromatic: float = None,
                   aliphatic: float = None,
                   polar: float = None,
                   positive: float = None,
                   negative: float = None,
                   glycine: float = None,
                   proline: float = None,
                   cysteine: float = None,
                   histidine: float = None,
                   num_attempts=100, strict_disorder=False,
                   disorder_cutoff=parameters.DISORDER_THRESHOLD,
                   cutoff = None,  # legacy parameter name
                   metapredict_version=parameters.METAPREDICT_DEFAULT_VERSION,
                   max_consecutive_ordered=parameters.ALLOWED_CONSECUTIVE_ORDERED,
                   max_total_ordered=parameters.ALLOWED_TOTAL_ORDERED_FRACTION,
                   remaining_probabilities=None,
                   max_class_fractions=None):
    """
    Generate a disordered sequence with specified amino acid class fractions.

    Instead of fixing individual amino acids, the caller fixes the fraction
    of whole residue classes (aromatic, aliphatic, polar, charged, ...).
    Every class fraction defaults to None and is handed to the validator and
    the backend unchanged.

    Parameters
    ----------
    length : int
        Length of the desired disordered sequence. Checked with
        ``goose_tools.length_check``.
    aromatic : float, optional
        Fraction of aromatic amino acids (F, W, Y), between 0 and 1.
        Default is None.
    aliphatic : float, optional
        Fraction of aliphatic amino acids (A, I, L, V), between 0 and 1.
        Default is None.
    polar : float, optional
        Fraction of polar amino acids (N, Q, S, T), between 0 and 1.
        Default is None.
    positive : float, optional
        Fraction of positively charged amino acids (K, R), between 0 and 1.
        Default is None.
    negative : float, optional
        Fraction of negatively charged amino acids (D, E), between 0 and 1.
        Default is None.
    glycine : float, optional
        Fraction of glycine (G), between 0 and 1. Default is None.
    proline : float, optional
        Fraction of proline (P), between 0 and 1. Default is None.
    cysteine : float, optional
        Fraction of cysteine (C), between 0 and 1. Default is None.
    histidine : float, optional
        Fraction of histidine (H), between 0 and 1. Default is None.
    num_attempts : int, optional
        Number of attempts to generate the sequence. Default is 100.
    strict_disorder : bool, optional
        If True, all residues must be above the disorder threshold.
        Default is False.
    disorder_cutoff : float, optional
        Disorder threshold for sequence validation. Default is
        ``parameters.DISORDER_THRESHOLD``.
    cutoff : float, optional
        Legacy name for ``disorder_cutoff``. When not None it overrides
        ``disorder_cutoff``.
    metapredict_version : int, optional
        Version of MetaPredict to use for disorder prediction. Default is
        ``parameters.METAPREDICT_DEFAULT_VERSION``.
    max_consecutive_ordered : int, optional
        Maximum number of consecutive ordered residues allowed. Default is
        ``parameters.ALLOWED_CONSECUTIVE_ORDERED``.
    max_total_ordered : float, optional
        Maximum fraction of ordered residues allowed. Default is
        ``parameters.ALLOWED_TOTAL_ORDERED_FRACTION``.
    remaining_probabilities : dict or str, optional
        Custom amino acid probabilities. Either a dict keyed by single-letter
        amino acid codes, or one of the organism names from
        idr_probabilities.py:
        'mouse', 'fly', 'neurospora', 'yeast', 'arabidopsis', 'e_coli',
        'worm', 'zebrafish', 'frog', 'dictyostelium', 'human', 'unbiased',
        'all'. A non-None value is normalised through
        ``goose_tools.handle_custom_probabilities``.
    max_class_fractions : dict, optional
        Overrides for the maximum allowed fraction of a class. Entries whose
        key exists in ``parameters.MAX_CLASS_FRACTIONS`` replace the default
        limit; entries with any other key are ignored. Documented class
        names are 'aromatic', 'aliphatic', 'polar', 'positive', 'negative',
        'glycine', 'proline', 'cysteine', 'histidine'. The resulting limits
        are used for validation only; they are not forwarded to the backend.

    Returns
    -------
    str
        Generated amino acid sequence, as returned by
        ``sequence_generation.by_class``.

    Raises
    ------
    GooseInputError
        Documented as raised for invalid parameters; the checks themselves
        live in ``goose_tools``.
    GooseFail
        If the backend returns None, i.e. no sequence could be generated.

    Examples
    --------
    >>> # Generate sequence with 20% aromatic and 10% positive residues
    >>> seq = seq_by_classes(100, aromatic=0.2, positive=0.1)
    >>>
    >>> # Generate sequence with multiple class constraints
    >>> seq = seq_by_classes(75, aromatic=0.15, polar=0.25, glycine=0.1)
    """
    # Reject unacceptable lengths before doing anything else.
    goose_tools.length_check(length)

    # Work on a copy of the package defaults so the shared dict in
    # `parameters` is never mutated; caller overrides only apply to keys the
    # defaults already define.
    user_limits = max_class_fractions
    max_class_fractions = parameters.MAX_CLASS_FRACTIONS.copy()
    if user_limits is not None:
        max_class_fractions.update({class_name: user_limits[class_name]
                                    for class_name in user_limits
                                    if class_name in max_class_fractions})

    # Gather the class fractions once; they are needed both for validation
    # and for the backend call.
    class_fractions = {
        'aromatic': aromatic,
        'aliphatic': aliphatic,
        'polar': polar,
        'positive': positive,
        'negative': negative,
        'glycine': glycine,
        'proline': proline,
        'cysteine': cysteine,
        'histidine': histidine,
    }
    goose_tools.check_class_values(max_class_fractions, **class_fractions)

    # Legacy keyword: an explicit 'cutoff' takes precedence over
    # 'disorder_cutoff'.
    if cutoff is not None:
        disorder_cutoff = cutoff

    # Normalise user-supplied probabilities (organism name or dict).
    if remaining_probabilities is not None:
        remaining_probabilities = goose_tools.handle_custom_probabilities(remaining_probabilities)

    # Validation of the parameters shared by all generator functions.
    goose_tools.check_basic_parameters(num_attempts=num_attempts,
                                       strict_disorder=strict_disorder,
                                       disorder_cutoff=disorder_cutoff,
                                       metapredict_version=metapredict_version,
                                       max_consecutive_ordered=max_consecutive_ordered,
                                       max_total_ordered=max_total_ordered,
                                       custom_probabilities=remaining_probabilities)

    # The backend expects each class under the name '<class>_fraction'.
    backend_fractions = {f'{class_name}_fraction': fraction
                         for class_name, fraction in class_fractions.items()}
    generated_seq = sequence_generation.by_class(length,
                                                 **backend_fractions,
                                                 num_attempts=num_attempts,
                                                 strict_disorder=strict_disorder,
                                                 disorder_cutoff=disorder_cutoff,
                                                 metapredict_version=metapredict_version,
                                                 max_consecutive_ordered=max_consecutive_ordered,
                                                 max_total_ordered=max_total_ordered,
                                                 remaining_probabilities=remaining_probabilities)

    # A None result from the backend means generation failed.
    if generated_seq is not None:
        return generated_seq
    raise goose_exceptions.GooseFail('Unable to generate sequence. Please try again with different parameters or a lower cutoff value.')
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ Create \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ sequence \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ By \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ dimensions \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/ \|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
#-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-\|/-
[docs]
def _seq_by_dimensions(length, objective, rg_or_re, allowed_error, attempts,
                       disorder_cutoff, strict_disorder, reduce_pos_charged,
                       exclude_aas, metapredict_version,
                       max_consecutive_ordered, max_total_ordered, cutoff):
    """
    Shared implementation behind ``seq_by_re`` and ``seq_by_rg``.

    Validates the inputs and dispatches to
    ``sequence_generation.by_dimensions``. ``rg_or_re`` must be 're' or 'rg'
    and selects both the bounds used for validation and the backend mode.
    All other parameters have the meaning documented on the public wrappers.
    """
    # Reject unacceptable lengths before doing anything else.
    goose_tools.length_check(length)

    # Pick the bounds matching the requested dimension.
    if rg_or_re == 're':
        label = 'Re'
        min_possible_value = parameters.get_min_re(length)
        max_possible_value = parameters.get_max_re(length)
    else:
        label = 'Rg'
        min_possible_value = parameters.get_min_rg(length)
        max_possible_value = parameters.get_max_rg(length)

    # The target must lie inside the range reported for this length.
    if objective < min_possible_value or objective > max_possible_value:
        raise goose_exceptions.GooseInputError(
            f'Cannot generate sequence, for length {length}, '
            f'min {label} = {min_possible_value}, max {label} = {max_possible_value}.')

    # Legacy keyword: an explicit 'cutoff' takes precedence over
    # 'disorder_cutoff'.
    if cutoff is not None:
        disorder_cutoff = cutoff

    # Negative tolerances are rejected (zero is accepted by this check).
    if allowed_error < 0:
        raise goose_exceptions.GooseInputError('Allowed error must be a positive number.')

    # reduce_pos_charged must be a real bool, not merely truthy/falsy.
    if not isinstance(reduce_pos_charged, bool):
        raise goose_exceptions.GooseInputError('reduce_pos_charged must be a boolean value.')

    # Validation of the parameters shared by all generator functions.
    goose_tools.check_basic_parameters(num_attempts=attempts,
                                       strict_disorder=strict_disorder,
                                       disorder_cutoff=disorder_cutoff,
                                       metapredict_version=metapredict_version,
                                       max_consecutive_ordered=max_consecutive_ordered,
                                       max_total_ordered=max_total_ordered,
                                       exclude=exclude_aas)

    # The backend result is returned as-is; no None check is performed here.
    return sequence_generation.by_dimensions(
        length, objective, rg_or_re=rg_or_re,
        allowed_error=allowed_error,
        reduce_pos_charged=reduce_pos_charged,
        exclude_aas=exclude_aas,
        num_attempts=attempts,
        strict_disorder=strict_disorder,
        disorder_cutoff=disorder_cutoff,
        metapredict_version=metapredict_version,
        max_consecutive_ordered=max_consecutive_ordered,
        max_total_ordered=max_total_ordered
    )


def seq_by_re(length, objective_re, allowed_error=parameters.MAXIMUM_RG_RE_ERROR,
              attempts=100, disorder_cutoff=parameters.DISORDER_THRESHOLD,
              strict_disorder=False, reduce_pos_charged=False, exclude_aas=None,
              metapredict_version=parameters.METAPREDICT_DEFAULT_VERSION,
              max_consecutive_ordered=parameters.ALLOWED_CONSECUTIVE_ORDERED,
              max_total_ordered=parameters.ALLOWED_TOTAL_ORDERED_FRACTION,
              cutoff=None):
    """
    Generate a disordered sequence with a specified end-to-end distance (Re).

    Creates an intrinsically disordered sequence with a target end-to-end
    distance (Re) in Angstroms. The end-to-end distance is the average
    distance between the N and C termini of the sequence.

    Parameters
    ----------
    length : int
        Length of the sequence to generate.
    objective_re : float
        Target end-to-end distance in Angstroms. Must lie between
        ``parameters.get_min_re(length)`` and ``parameters.get_max_re(length)``.
    allowed_error : float, optional
        Allowed error between the target and actual Re value. Must not be
        negative. Default is ``parameters.MAXIMUM_RG_RE_ERROR``.
    attempts : int, optional
        Number of attempts to generate the sequence. Default is 100.
    disorder_cutoff : float, optional
        Disorder threshold for sequence validation. Default is
        ``parameters.DISORDER_THRESHOLD``.
    strict_disorder : bool, optional
        If True, all residues must be above the disorder threshold.
        Default is False.
    reduce_pos_charged : bool, optional
        Whether to reduce positively charged amino acids in the sequence.
        Default is False. In vivo data suggests positively charged residues
        may not drive sequence expansion as much as predicted by the model.
    exclude_aas : list, optional
        List of amino acids to exclude from the sequence. Default is None.
    metapredict_version : int, optional
        Version of MetaPredict to use for disorder prediction. Default is
        ``parameters.METAPREDICT_DEFAULT_VERSION``.
    max_consecutive_ordered : int, optional
        Maximum number of consecutive ordered residues allowed. Default is
        ``parameters.ALLOWED_CONSECUTIVE_ORDERED``.
    max_total_ordered : float, optional
        Maximum fraction of ordered residues allowed. Default is
        ``parameters.ALLOWED_TOTAL_ORDERED_FRACTION``.
    cutoff : float, optional
        Legacy name for ``disorder_cutoff``. When not None it overrides
        ``disorder_cutoff``.

    Returns
    -------
    str
        Generated amino acid sequence, as returned by
        ``sequence_generation.by_dimensions``.

    Raises
    ------
    GooseInputError
        If objective_re is outside the possible range for the given length,
        if allowed_error is negative, or if reduce_pos_charged is not a bool.
        Further checks live in ``goose_tools``.
    GooseFail
        Documented as raised when generation fails after all attempts. It is
        not raised in this function, so it has to come from the backend
        (not verified here).

    Examples
    --------
    >>> # Generate a 100-residue sequence with Re = 50 Å
    >>> seq = seq_by_re(100, 50.0)
    >>>
    >>> # Generate with custom error tolerance
    >>> seq = seq_by_re(75, 40.0, allowed_error=2.0)
    """
    return _seq_by_dimensions(length, objective_re, 're',
                              allowed_error=allowed_error,
                              attempts=attempts,
                              disorder_cutoff=disorder_cutoff,
                              strict_disorder=strict_disorder,
                              reduce_pos_charged=reduce_pos_charged,
                              exclude_aas=exclude_aas,
                              metapredict_version=metapredict_version,
                              max_consecutive_ordered=max_consecutive_ordered,
                              max_total_ordered=max_total_ordered,
                              cutoff=cutoff)


def seq_by_rg(length, objective_rg, allowed_error=parameters.MAXIMUM_RG_RE_ERROR,
              attempts=100, disorder_cutoff=parameters.DISORDER_THRESHOLD,
              strict_disorder=False, reduce_pos_charged=False, exclude_aas=None,
              metapredict_version=parameters.METAPREDICT_DEFAULT_VERSION,
              max_consecutive_ordered=parameters.ALLOWED_CONSECUTIVE_ORDERED,
              max_total_ordered=parameters.ALLOWED_TOTAL_ORDERED_FRACTION,
              cutoff=None):
    """
    Generate a disordered sequence with a specified radius of gyration (Rg).

    Creates an intrinsically disordered sequence with a target radius of
    gyration (Rg) in Angstroms. The radius of gyration is a measure of the
    compactness of the sequence's ensemble of conformations.

    Parameters
    ----------
    length : int
        Length of the sequence to generate.
    objective_rg : float
        Target radius of gyration in Angstroms. Must lie between
        ``parameters.get_min_rg(length)`` and ``parameters.get_max_rg(length)``.
    allowed_error : float, optional
        Allowed error between the target and actual Rg value. Must not be
        negative. Default is ``parameters.MAXIMUM_RG_RE_ERROR``.
    attempts : int, optional
        Number of attempts to generate the sequence. Default is 100.
    disorder_cutoff : float, optional
        Disorder threshold for sequence validation. Default is
        ``parameters.DISORDER_THRESHOLD``.
    strict_disorder : bool, optional
        If True, all residues must be above the disorder threshold.
        Default is False.
    reduce_pos_charged : bool, optional
        Whether to reduce positively charged amino acids in the sequence.
        Default is False. In vivo data suggests positively charged residues
        may not drive sequence expansion as much as predicted by the model.
    exclude_aas : list, optional
        List of amino acids to exclude from the sequence. Default is None.
    metapredict_version : int, optional
        Version of MetaPredict to use for disorder prediction. Default is
        ``parameters.METAPREDICT_DEFAULT_VERSION``.
    max_consecutive_ordered : int, optional
        Maximum number of consecutive ordered residues allowed. Default is
        ``parameters.ALLOWED_CONSECUTIVE_ORDERED``.
    max_total_ordered : float, optional
        Maximum fraction of ordered residues allowed. Default is
        ``parameters.ALLOWED_TOTAL_ORDERED_FRACTION``.
    cutoff : float, optional
        Legacy name for ``disorder_cutoff``. When not None it overrides
        ``disorder_cutoff``.

    Returns
    -------
    str
        Generated amino acid sequence, as returned by
        ``sequence_generation.by_dimensions``.

    Raises
    ------
    GooseInputError
        If objective_rg is outside the possible range for the given length,
        if allowed_error is negative, or if reduce_pos_charged is not a bool.
        Further checks live in ``goose_tools``.
    GooseFail
        Documented as raised when generation fails after all attempts. It is
        not raised in this function, so it has to come from the backend
        (not verified here).

    Examples
    --------
    >>> # Generate a 100-residue sequence with Rg = 25 Å
    >>> seq = seq_by_rg(100, 25.0)
    >>>
    >>> # Generate with reduced positive charges
    >>> seq = seq_by_rg(75, 20.0, reduce_pos_charged=True)
    """
    return _seq_by_dimensions(length, objective_rg, 'rg',
                              allowed_error=allowed_error,
                              attempts=attempts,
                              disorder_cutoff=disorder_cutoff,
                              strict_disorder=strict_disorder,
                              reduce_pos_charged=reduce_pos_charged,
                              exclude_aas=exclude_aas,
                              metapredict_version=metapredict_version,
                              max_consecutive_ordered=max_consecutive_ordered,
                              max_total_ordered=max_total_ordered,
                              cutoff=cutoff)
#===============================================================================
#===============================================================================
# VARIANT GENERATORS
#===============================================================================
#===============================================================================
[docs]
def variant(sequence,
            variant_type,
            **kwargs):
    """
    Generate a variant of an input sequence using a named transformation method.

    This function is a unified front end to ``VariantGenerator``. It validates
    the input sequence and the requested ``variant_type``, checks that the
    parameters required by that variant type were supplied, and then calls
    the corresponding backend method.

    Parameters
    ----------
    sequence : str
        The amino acid sequence to generate variants from. Must be a non-empty
        string that passes ``goose_tools.is_valid_sequence``.
    variant_type : str
        The type of variant to generate. Available options, with the keyword
        arguments each one requires / optionally accepts:

        Shuffling methods:
        - 'shuffle_specific_regions': Shuffle only specified regions
          (requires: shuffle_regions)
        - 'shuffle_except_specific_regions': Shuffle all except specified regions
          (requires: excluded_regions)
        - 'shuffle_specific_residues': Shuffle only specific residue types
          (requires: target_residues)
        - 'shuffle_except_specific_residues': Shuffle all except specific residue types
          (requires: excluded_residues)
        - 'weighted_shuffle_specific_residues': Weighted shuffle of specific residues
          (requires: target_residues, shuffle_weight)
        - 'targeted_reposition_specific_residues': Reposition specific residues
          (requires: target_residues)

        Asymmetry and property methods:
        - 'change_residue_asymmetry': Change residue asymmetry patterns
          (requires: target_residues; optional: num_changes, increase_or_decrease)
        - 'constant_properties': Generate variant with constant properties
          (optional: exclude_residues)
        - 'constant_residues_and_properties': Keep specified residues and properties constant
          (requires: constant_residues)
        - 'constant_properties_and_class': Generate variant with constant properties and class
        - 'constant_properties_and_class_by_order': Generate variant with constant
          properties and class by order

        Property modification methods:
        - 'change_hydropathy_constant_class': Change hydropathy while keeping class constant
          (requires: target_hydropathy)
        - 'change_fcr_minimize_class_changes': Change FCR while minimizing class changes
          (requires: target_FCR)
        - 'change_ncpr_constant_class': Change NCPR while keeping class constant
          (requires: target_NCPR)
        - 'change_kappa': Change kappa value
          (requires: target_kappa)
        - 'change_properties_minimize_differences': Change properties while minimizing differences
          (optional: target_hydropathy, target_kappa, target_FCR, target_NCPR)
        - 'change_any_properties': Change any combination of properties
          (optional: target_FCR, target_NCPR, target_kappa, target_hydropathy)
        - 'change_dimensions': Change sequence dimensions (Rg/Re)
          (requires: increase_or_decrease, rg_or_re; optional: num_dim_attempts,
          allowed_error, reduce_pos_charged, exclude_aas)
    **kwargs : dict
        Additional parameters. Keyword arguments that are neither general
        parameters nor listed as required/optional for the chosen
        ``variant_type`` are ignored.

        General parameters (passed to the ``VariantGenerator`` constructor):
        - num_attempts (int): Number of attempts to generate variant (default: 100)
        - strict_disorder (bool): Whether to use strict disorder checking (default: False)
        - disorder_cutoff (float): Disorder cutoff threshold
          (default: parameters.DISORDER_THRESHOLD)
        - metapredict_version (int): MetaPredict version to use
          (default: parameters.METAPREDICT_DEFAULT_VERSION)
        - hydropathy_tolerance (float): Hydropathy tolerance
          (default: parameters.MAXIMUM_HYDRO_ERROR)
        - kappa_tolerance (float): Kappa tolerance
          (default: parameters.MAXIMUM_KAPPA_ERROR)

        Variant-specific parameters (forwarded to the backend method):
        - shuffle_regions (list): Regions to shuffle (tuple pairs of start/end positions)
        - excluded_regions (list): Regions to exclude from shuffling
        - target_residues (list): Specific residues to target
        - excluded_residues (list): Specific residues to exclude
        - shuffle_weight (float): Weight for shuffling operations
        - num_changes (int): Number of changes to make
        - increase_or_decrease (str): Direction of change ('increase' or 'decrease')
        - exclude_residues (list): Residues to exclude from modifications
        - constant_residues (list): Residues to keep constant
        - target_hydropathy (float): Target hydropathy value
        - target_FCR (float): Target FCR value
        - target_NCPR (float): Target NCPR value
        - target_kappa (float): Target kappa value
        - rg_or_re (str): Whether to optimize 'rg' or 're'
        - num_dim_attempts (int): Number of dimensional optimization attempts
        - allowed_error (float): Allowed error for dimensional constraints
        - reduce_pos_charged (bool): Whether to reduce positive charges
        - exclude_aas (list): Amino acids to exclude from generation

    Returns
    -------
    str
        Generated variant sequence as a string.

    Raises
    ------
    GooseInputError
        If the sequence is not a string, is empty, or is not a valid amino
        acid sequence; if ``variant_type`` is not recognised; if a required
        parameter for the variant type is missing; or if the backend method
        named in the dispatch table does not exist on ``VariantGenerator``.
    GooseFail
        If the backend method raises any exception (the original exception is
        attached as ``__cause__``) or returns ``None``.

    Examples
    --------
    >>> # Shuffle specific regions of a sequence
    >>> original = "MSEDKQRTYHLNVAIGPKWF"
    >>> shuffled = variant(original, 'shuffle_specific_regions',
    ...                    shuffle_regions=[(0, 5), (10, 15)])
    >>>
    >>> # Change hydropathy while keeping amino acid classes constant
    >>> more_hydrophobic = variant(original, 'change_hydropathy_constant_class',
    ...                            target_hydropathy=3.5)
    >>>
    >>> # Generate variant with constant properties but different sequence
    >>> same_props = variant(original, 'constant_properties', num_attempts=50)

    Notes
    -----
    The function uses the VariantGenerator class from the backend to perform
    the actual sequence modifications. Each variant type has specific parameter
    requirements - consult the documentation for detailed parameter descriptions.
    Region specifications use 0-based indexing where (start, end) includes
    positions from start to end-1, following Python slice conventions.
    """
    # Validate that the input sequence is a non-empty string
    if not isinstance(sequence, str):
        raise goose_exceptions.GooseInputError('Sequence must be a string')
    if len(sequence) == 0:
        raise goose_exceptions.GooseInputError('Sequence cannot be empty')

    # Verify input sequence is a valid amino acid sequence
    if not goose_tools.is_valid_sequence(sequence):
        raise goose_exceptions.GooseInputError('Invalid sequence. Must contain valid amino acid codes.')

    # Dispatch table: variant_type -> (backend method name, required params, optional params).
    # This is the single source of truth for which variant types are valid,
    # so the validity check below cannot drift out of sync with it.
    method_dispatch = {
        'shuffle_specific_regions': ('shuffle_specific_regions', ['shuffle_regions'], []),
        'shuffle_except_specific_regions': ('shuffle_except_specific_regions', ['excluded_regions'], []),
        'shuffle_specific_residues': ('shuffle_specific_residues', ['target_residues'], []),
        'shuffle_except_specific_residues': ('shuffle_except_specific_residues', ['excluded_residues'], []),
        'weighted_shuffle_specific_residues': ('weighted_shuffle_specific_residues', ['target_residues', 'shuffle_weight'], []),
        'targeted_reposition_specific_residues': ('targeted_reposition_specific_residues', ['target_residues'], []),
        'change_residue_asymmetry': ('change_residue_asymmetry', ['target_residues'], ['num_changes', 'increase_or_decrease']),
        'constant_properties': ('constant_properties', [], ['exclude_residues']),
        'constant_residues_and_properties': ('constant_residues_and_properties', ['constant_residues'], []),
        'constant_properties_and_class': ('constant_properties_and_class', [], []),
        'constant_properties_and_class_by_order': ('constant_properties_and_class_by_order', [], []),
        'change_hydropathy_constant_class': ('change_hydropathy_constant_class', ['target_hydropathy'], []),
        'change_fcr_minimize_class_changes': ('change_fcr_minimize_class_changes', ['target_FCR'], []),
        'change_ncpr_constant_class': ('change_ncpr_constant_class', ['target_NCPR'], []),
        'change_kappa': ('change_kappa', ['target_kappa'], []),
        # NOTE(review): the backend method name is spelled 'minimze' here;
        # confirm it matches VariantGenerator before "correcting" the spelling.
        'change_properties_minimize_differences': ('change_properties_minimze_differences', [], ['target_hydropathy', 'target_kappa', 'target_FCR', 'target_NCPR']),
        'change_any_properties': ('change_any_properties', [], ['target_FCR', 'target_NCPR', 'target_kappa', 'target_hydropathy']),
        'change_dimensions': ('change_dimensions', ['increase_or_decrease', 'rg_or_re'], ['num_dim_attempts', 'allowed_error', 'reduce_pos_charged', 'exclude_aas']),
    }

    # Validate that the variant type is supported
    if variant_type not in method_dispatch:
        valid_names = ", ".join(sorted(method_dispatch))
        raise goose_exceptions.GooseInputError(f'Invalid variant_type: {variant_type}. Must be one of: {valid_names}')

    # Extract common parameters with default values.
    # These are handed to the VariantGenerator constructor, not to the
    # individual backend method.
    common_params = {
        'num_attempts': kwargs.get('num_attempts', 100),
        'strict_disorder': kwargs.get('strict_disorder', False),
        'disorder_cutoff': kwargs.get('disorder_cutoff', parameters.DISORDER_THRESHOLD),
        'metapredict_version': kwargs.get('metapredict_version', parameters.METAPREDICT_DEFAULT_VERSION),
        'hydropathy_tolerance': kwargs.get('hydropathy_tolerance', parameters.MAXIMUM_HYDRO_ERROR),
        'kappa_tolerance': kwargs.get('kappa_tolerance', parameters.MAXIMUM_KAPPA_ERROR),
    }

    # Create VariantGenerator instance with common parameters
    generator = VariantGenerator(**common_params)

    # Get method information for the requested variant type
    method_name, required_params, optional_params = method_dispatch[variant_type]

    # Validate that all required parameters are provided
    for param in required_params:
        if param not in kwargs:
            raise goose_exceptions.GooseInputError(f'Missing required parameter for {variant_type}: {param}')

    # Prepare arguments for the backend method call: the sequence, every
    # required parameter, and whichever optional parameters were supplied.
    method_args = {'input_sequence': sequence}
    method_args.update((param, kwargs[param]) for param in required_params)
    method_args.update((param, kwargs[param]) for param in optional_params if param in kwargs)

    # Resolve the backend method on its own, so that an AttributeError raised
    # *inside* the backend call is not misreported as a missing method.
    try:
        method = getattr(generator, method_name)
    except AttributeError as err:
        # This should not happen if method_dispatch is correct, but catch it just in case
        raise goose_exceptions.GooseInputError(f'Method {method_name} not found in VariantGenerator') from err

    # Only the backend call itself is guarded; any error it raises is
    # re-raised as GooseFail with the original exception chained as the cause.
    try:
        result = method(**method_args)
    except Exception as e:
        raise goose_exceptions.GooseFail(f'Error generating variant: {str(e)}') from e

    # Checked outside the try block so this GooseFail is not caught and
    # re-wrapped by the generic handler above.
    if result is None:
        raise goose_exceptions.GooseFail(f'Failed to generate variant of type {variant_type}. Try adjusting parameters or increasing num_attempts.')
    return result