Commits (15)
......@@ -2,7 +2,7 @@ import sys
import numpy as np
from pythoms.mzml import mzML
from pythoms.molecule import Molecule
from pythoms.common_losses import losses, stored_dec
from pythoms.molecule.common_losses import losses, stored_dec
def com_loss(*custom_losses, dec=0):
......
__version__ = '2.1.6'
__version__ = '2.2.0'
"""
IGNORE:
Molecule class (previously "isotope pattern generator" and "MolecularFormula")
The output of this builder has been validated against values calculated by ChemCalc (www.chemcalc.org)
Negligable differences are attributed to different low value discarding techniques
(ChemCalc keeps the top 5000 peaks, this script drops values less than a threshold 5 orders of magnitude below the
maximum value)
CHANGELOG:
- added exact mass comparison
- separated fwhm calculation from sigma
- fwhm calculation now uses monoisotopic mass
- barisotope pattern now groups using the full width at half max
- gaussian isotope pattern generation now works off of rawip by default
- updated to use Progress class
- updated gaussian isotope pattern generator to automatically determine the appropriate decimal places
---2.9 INCOMPATIBLE WITH SPECTRUM v2.4 or older
- moved charge application to raw isotope pattern function
- fixed bug in validation function for charged molecules
- added support for and enabled auto-saving of molecule instances (loading and saving to .mol files)
IGNORE
"""
from .base import Molecule
from .ip import IPMolecule
from .logging import logger
__all__ = [
'Molecule',
'IPMolecule',
'logger',
]
if __name__ == '__main__': # for testing and troubleshooting
mol = IPMolecule(
'L2PdAr+I',
# charge= 2, # specify charge (if not specified in formula)
# res=1050000, # specify spectrometer resolution (default 5000)
verbose=True,
# decpl=10,
# dropmethod='threshold',
# threshold=0.00001,
# ipmethod='hybrid',
ipmethod='combinatorics',
# keepall=True,
)
mol.print_details()
......@@ -9,7 +9,7 @@ Each new entry should be a dictionary giving the molecular formula
abbrvs = {
# common chemical abbreviations
'Ac': {'C': 2, 'H': 3, 'O': 1}, # acetyl
'Ar+': {'C': 25, 'H': 21, 'P': 1}, # phosphonium tagged aryl group
'Ar+': {'C': 25, 'H': 21, 'P': 1}, # phosphonium tagged aryl group (McIndoe group special)
'Bn': {'C': 7, 'H': 7}, # benzyl
'Bu': {'C': 4, 'H': 8}, # butyl
'Cod': {'C': 8, 'H': 12}, # cyclooctadiene
......
from .multiplicative import isotope_pattern_multiplicative
from .combinatoric import isotope_pattern_combinatoric, isotope_pattern_hybrid
from .simulated import gaussian_isotope_pattern
from .isospec import isotope_pattern_isospec
from .bar import bar_isotope_pattern, VALID_DROPMETHODS, VALID_GROUP_METHODS
# valid isotope pattern generation methods
VALID_IPMETHODS = [
'combinatorics',
'multiplicative',
'hybrid',
'isospec', # uses isospecpy package
# 'cuda',
]
__all__ = [
'VALID_IPMETHODS',
'VALID_DROPMETHODS',
'VALID_GROUP_METHODS',
'isotope_pattern_multiplicative',
'isotope_pattern_combinatoric',
'isotope_pattern_hybrid',
'gaussian_isotope_pattern',
'isotope_pattern_isospec',
'bar_isotope_pattern',
]
from ...spectrum import Spectrum, weighted_average
from ..logging import logger
from ..util import group_masses, centroid
from ..settings import VERBOSE
logger = logger.getChild(__name__)
# valid dropping methods
VALID_DROPMETHODS = [
None, # no dropping
'threshold', # drop values below threshold
'npeaks', # keep top n number of peaks
# 'consolidate', # consolidate intensities
]
# valid grouping methods
VALID_GROUP_METHODS = [
'weighted',
'centroid',
]
def bar_isotope_pattern(
rawip: list,
delta: float = 0.5,
method: str = 'weighted',
verbose: bool = VERBOSE,
):
"""
Converts a raw isotope pattern into a bar isotope pattern. This groups mass defects
that are within a given difference from each other into a single *m/z* value and
intensity.
:param rawip: The raw isotope pattern with no grouping applied
:param delta: The *m/z* difference to check around a peak when grouping it into a single *m/z* value.
The script will look delta/2 from the peak being checked
:param method: Method of combining (weighted or centroid). Weighted is recommended for accuracy
:param verbose: chatty mode
:return: bar isotope pattern in ``[[m/z values],[intensity values]]`` format.
:rtype: list
"""
if method not in VALID_GROUP_METHODS:
raise ValueError(f'The grouping method {method} is invalid. Choose from {", ".join(VALID_GROUP_METHODS)}')
if verbose is True:
logger.info('generating bar isotope pattern')
if isinstance(rawip, Spectrum): # if handed a Spectrum object, trim before continuing
rawip = rawip.trim()
groupedip = group_masses(rawip, delta / 2)
out = [[], []]
for group in groupedip:
if method == 'weighted':
x, y = weighted_average(*group) # determine weighted mass and summed intensity
elif method == 'centroid':
x, y = centroid(group)
out[0].append(x)
out[1].append(y)
maxint = max(out[1])
for ind, val in enumerate(out[1]):
out[0][ind] = out[0][ind] # / abs(charge)
out[1][ind] = val / maxint * 100. # normalize to 100
if verbose is True:
logger.info('bar isotope pattern generation complete')
return out
from tqdm import tqdm
from ...spectrum import Spectrum
from ..settings import THRESHOLD, NPEAKS, CONSOLIDATE, VERBOSE
from ..logging import logger
from ..util import autodec, numberofcwr, product, num_permu, ReiterableCWR
from ..mass import mass_dict, string_to_isotope
logger = logger.getChild(__name__)
def isotope_pattern_hybrid(composition: dict,
fwhm: float,
decpl: int,
verbose: bool = VERBOSE,
dropmethod: str = None,
threshold: float = THRESHOLD,
npeaks: int = NPEAKS,
consolidate: float = CONSOLIDATE,
**kwargs,
):
"""
A hybrid isotope pattern calculator which calculates the isotope pattern from each element, then multiplies the
lists.
:param composition: composition dictionary
:param fwhm: full-width-at-half-maximum
:param decpl: decimal places to track in the Spectrum object
:param verbose: chatty mode
:param dropmethod: optional method to use for low-intensity peak dropping or consolidation. Valid options are
'threshold', 'npeaks', or 'consolidate'.
:param threshold: if the dropmethod is set to 'threshold', any peaks below this threshold will be dropped.
:param npeaks: if the dropmethod is set to 'npeaks', the top n peaks will be kept, with the rest being dropped.
:param consolidate: if the dropmethod is set to 'consolidate', any peaks below the threshold will be consolidated
into adjacent peaks using a weighted average. Any peaks that do not have a neighbour within 10^-`consolidate`
will be dropped entirely.
:return: isotope pattern as a Spectrum object
:rtype: Spectrum
"""
logger.info('generating hybrid isotope pattern')
eleips = {} # dictionary for storing the isotope patterns of each element
for element, number in composition.items():
eleips[element] = isotope_pattern_combinatoric( # calculate the isotope pattern for each element
{element: number},
decpl=decpl,
verbose=verbose,
).trim() # trim the generated spectra to lists
sortlist = []
for element in eleips:
sortlist.append((
len(eleips[element][0]),
element
))
sortlist = sorted(sortlist) # sorted list of elements based on the length of their isotope patterns
sortlist.reverse()
spec = None
# todo convert to context tqdm (update string)
for lenlist, element in tqdm(sortlist, desc='adding element to isotope pattern', disable=not verbose):
if spec is None:
spec = Spectrum(
autodec(fwhm), # decimal places
start=None, # minimum mass
end=None, # maximum mass
empty=True, # whether or not to use emptyspec
filler=0., # fill with zeros, not None
specin=eleips[element], # supply masses and abundances as initialization spectrum
)
continue
spec.add_element(eleips[element][0], eleips[element][1])
spec.normalize(100.) # normalize spectrum object
if dropmethod == 'threshold': # drop values below threshold
spec.threshold(threshold)
elif dropmethod == 'npeaks': # keep top n number of peaks
spec.keep_top_n(npeaks)
elif dropmethod == 'consolidate': # consolidate values being dropped
spec.consolidate(
threshold,
3 * 10 ** -consolidate
)
logger.info('hybrid isotope pattern generation complete')
return spec
def isotope_pattern_combinatoric(
comp: dict,
decpl: int,
verbose: bool = VERBOSE,
**kwargs, # catch for extra keyword arguments
):
"""
Calculates the raw isotope pattern of a given molecular formula with mass defects preserved.
Uses a combinatorial method to generate isotope formulae
:param comp: composition dictionary
:param decpl: decimal places to track in the Spectrum object
:param verbose: chatty mode
:return: raw isotope pattern as a Spectrum object
:rtype: Spectrum
"""
logger.info('generating combinatoric isotope pattern')
speciso = False # set state for specific isotope
isos = {} # isotopes dictionary
isosets = {} # set of isotopes for each element
iterators = [] # list of iterators
nk = []
for element in comp: # for each element
if element in mass_dict:
isosets[element] = [] # set of isotopes
for isotope in mass_dict[element]: # for each isotope of that element in the mass dictionary
if isotope != 0 and mass_dict[element][isotope][1] != 0: # of the intensity is nonzero
isosets[element].append(isotope) # track set of isotopes
isos[isotope] = element # create isotope,element association for reference
iterators.append(
ReiterableCWR( # create iterator instance
isosets[element],
comp[element]
)
)
if verbose is True:
nk.append([ # track n and k for list length output
len(isosets[element]),
comp[element]
])
else: # if it's an isotope
speciso = True
spec = Spectrum( # initiate spectrum object
decpl, # decimal places
start=None, # no minimum mass
end=None, # no maximum mass
empty=True, # whether or not to use emptyspec
filler=0., # fill with zeros, not None
)
iterations = int(cpu_list_product([numberofcwr(n, k) for n, k in nk])) # number of iterations
for comb in tqdm(product(*iterators), desc='processing isotope combination', total=iterations, disable=not verbose):
num = 1 # number of combinations counter
x = 0. # mass value
y = 1. # intensity value
for tup in comb: # for each element combination
element = isos[tup[0]] # associate isotope to element
# counts = [tup.count(x) for x in isosets[element]] # count the number of occurances of each isotope
# num *= num_permu(tup,counts) # determine the number of permutations of the set
# for ind,isotope in enumerate(isosets[element]):
# x += self.md[element][isotope][0] * counts[ind]
# y *= self.md[element][isotope][1] ** counts[ind]
num *= num_permu(tup, isosets[element]) # multiply the number by the possible permutations
for isotope in tup: # for each isotope
x += mass_dict[element][isotope][0] # shift x
y *= mass_dict[element][isotope][1] # multiply intensity
# add the x and y combination factored by the number of times that combination will occur
spec.add_value(x, y * num)
if speciso is True: # if an isotope was specified
for element in comp:
if element not in mass_dict: # if an isotope
ele, iso = string_to_isotope(element) # determine element and isotope
spec.shift_x(mass_dict[ele][iso][0] * comp[element]) # shift the x values by the isotopic mass
spec.normalize() # normalize the spectrum object
logger.info('combinatoric isotope pattern generation complete')
return spec
def cpu_list_product(iterable):
"""returns the product of a list"""
prod = 1
for n in iterable:
prod *= n
return prod
from IsoSpecPy import IsoThreshold
from ..settings import THRESHOLD, VERBOSE
from ..mass import mass_dict
from ..logging import logger
from ...spectrum import Spectrum
logger = logger.getChild(__name__)
# flag for reminding folk to cite people
_ISOSPEC_CITATION_REMINDER = False
def isotope_pattern_isospec(
comp: dict,
decpl: int,
verbose: bool = VERBOSE,
threshold: float = THRESHOLD,
**kwargs,
):
"""
Generates a raw isotope pattern using the isospecpy package. http://matteolacki.github.io/IsoSpec/
:param comp: composition dictionary
:param decpl: decimal places to track while converting from isospec to Spectrum
:param verbose: chatty mode
:param threshold: threshold level (relative, seems slightly buggy)
:param kwargs: catch for extra kwargs
:return: Spectrum object
"""
logger.info('retrieving isotope pattern from isospec')
global _ISOSPEC_CITATION_REMINDER # todo make this non-global
if _ISOSPEC_CITATION_REMINDER is False: # remind the user on the first use
print('IsoSpecPy package was used, please cite https://dx.doi.org/10.1021/acs.analchem.6b01459')
_ISOSPEC_CITATION_REMINDER = True
if any([key not in mass_dict for key in comp]):
# todo see if there's a workaround for isotope specification
raise KeyError(f'Isotope specification is not supported in IsoSpec calling. Please use a different isotope '
f'pattern generation method for isotopes. ')
# todo see if there's a way to use IsoThresholdGenerator instead
# use IsoSpec algorithm to generate configurations
iso_spec = IsoThreshold(
formula="".join(f'{ele}{num}' for ele, num in comp.items()),
threshold=threshold * 0.1,
)
spec = Spectrum(
decpl, # decimal places
start=min(iso_spec.masses) - 10 ** -decpl, # minimum mass
end=max(iso_spec.masses) + 10 ** -decpl, # maximum mass
empty=True,
filler=0. # fill with zeros, not None
)
# add values to Spectrum object
for mass, abund in zip(iso_spec.masses, iso_spec.probs):
spec.add_value(
mass,
abund
)
spec.normalize() # normalize values to 100.
logger.info('isospec isotope pattern retrieval complete')
return spec
from tqdm import tqdm
from ..settings import THRESHOLD, NPEAKS, CONSOLIDATE, VERBOSE
from ..mass import mass_dict, string_to_isotope
from ..logging import logger
from ...spectrum import Spectrum
logger = logger.getChild(__name__)
def isotope_pattern_multiplicative(
comp: dict,
decpl: int,
verbose: bool = VERBOSE,
dropmethod: str = None,
threshold: float = THRESHOLD,
npeaks: int = NPEAKS,
consolidate: float = CONSOLIDATE,
**kwargs,
):
"""
Calculates the raw isotope pattern of a given molecular formula with mass defects preserved.
:param comp: The molecular composition dictionary. See ``Molecule.composition`` for more details.
:param decpl: The number of decimal places to track. This is normally controlled by the keyword
arguments of the class, but can be specified if called separately.
:param verbose: chatty mode
:param dropmethod: optional method to use for low-intensity peak dropping or consolidation. Valid options are
'threshold', 'npeaks', or 'consolidate'.
:param threshold: if the dropmethod is set to 'threshold', any peaks below this threshold will be dropped.
:param npeaks: if the dropmethod is set to 'npeaks', the top n peaks will be kept, with the rest being dropped.
:param consolidate: if the dropmethod is set to 'consolidate', any peaks below the threshold will be consolidated
into adjacent peaks using a weighted average. Any peaks that do not have a neighbour within 10^-`consolidate`
will be dropped entirely.
:return: Returns the isotope pattern with mass defects preserved (referred to as the 'raw'
isotope pattern in this script).
:rtype: Spectrum
"""
spec = None # initial state of spec
logger.info('generating multiplicative isotope pattern')
for key in comp: # for each element
if key in mass_dict: # if not a single isotope
masses = [] # list for masses of each isotope
abunds = [] # list for abundances
for mass in mass_dict[key]:
if mass != 0:
if mass_dict[key][mass][1] > 0: # if abundance is nonzero
masses.append(mass_dict[key][mass][0])
abunds.append(mass_dict[key][mass][1])
msg = f'Processing element {key}'
for n in tqdm(range(comp[key]), desc=msg, disable=not verbose): # for n number of each element
if spec is None: # if spectrum object has not been defined
spec = Spectrum(
decpl, # decimal places
start=min(masses) - 10 ** -decpl, # minimum mass
end=max(masses) + 10 ** -decpl, # maximum mass
specin=[masses, abunds], # supply masses and abundances as initialization spectrum
empty=True, # whether or not to use emptyspec
filler=0., # fill with zeros, not None
)
continue
spec.add_element(masses, abunds) # add the element to the spectrum object
spec.normalize(100.) # normalize spectrum
if dropmethod == 'threshold': # drop values below threshold
spec.threshold(threshold)
elif dropmethod == 'npeaks': # keep top n number of peaks
spec.keep_top_n(npeaks)
elif dropmethod == 'consolidate': # consolidate values being dropped
# todo figure out what's wrong here
raise NotImplementedError("There are bugs here, for the time being don't use the 'consolidate' "
"dropmethod.")
spec.consolidate(
threshold,
3 * 10 ** -consolidate
)
else: # if specific isotope
ele, iso = string_to_isotope(key) # find element and isotope
if spec is None: # if spectrum object has not been defined
spec = Spectrum(
decpl, # decimal places
start=(mass_dict[ele][iso][0] * float(comp[key])) - 10 ** -decpl, # minimum mass
end=(mass_dict[ele][iso][0] * float(comp[key])) + 10 ** -decpl, # maximum mass
specin=[[mass_dict[ele][iso][0] * float(comp[key])], [1.]],
# supply masses and abundances as initialization spectrum
empty=True, # whether or not to use emptyspec
filler=0. # fill with zeros, not None
)
continue
# todo add tqdm progress bar
spec.shift_x(mass_dict[ele][iso][0]) # offset spectrum object by the mass of that
spec.normalize()
logger.info('multiplicative isotope pattern generation complete')
return spec
from tqdm import tqdm
from ..settings import VERBOSE
from ..util import autodec, normal_distribution
from ..logging import logger
from ...spectrum import Spectrum
def gaussian_isotope_pattern(
barip: list,
fwhm: float,
verbose: bool = VERBOSE,
):
"""
Simulates the isotope pattern that would be observed in a mass
spectrometer with the resolution specified as a fwhm value.
:param barip: The isotope pattern to be simulated. This can be either the bar isotope
pattern or the raw isotope pattern (although this will be substantially
slower for large molecules).
:param fwhm: full-width-at-half-maximum
:param verbose: chatty mode
:return: The predicted gaussian isotope pattern in the form of a paired list
``[[m/z values],[intensity values]]``
:rtype: list
"""
logger.info('generating gaussian isotope pattern')
spec = Spectrum( # generate Spectrum object to encompass the entire region
autodec(fwhm),
start=min(barip[0]) - fwhm * 2,
end=max(barip[0]) + fwhm * 2,
empty=False, # whether or not to use emptyspec
filler=0., # fill with zeros, not None
)
msg = 'calculating gaussian isotope pattern'
# generate normal distributions for each peak
for ind, val in enumerate(tqdm(barip[0], desc=msg, total=len(barip[0]), disable=not verbose)):
nd = normal_distribution(val, fwhm, barip[1][ind]) # generate normal distribution for that peak
spec.add_spectrum(nd[0], nd[1]) # add the generated spectrum to the total spectrum
spec.normalize() # normalize
gausip = spec.trim() # trim None values and output
logger.info('gaussian isotope pattern generation complete')
return gausip
\ No newline at end of file
import copy
import numpy as np
from .settings import VERBOSE
from .logging import logger
from .parsing import interpret_charge, composition_from_formula, to_subscript, to_superscript
from .mass import abbreviations, check_in_mass_dict, mass_dict, string_to_isotope
class Molecule(object):
_comp = {} # storage for composition of the molecule
_mf = ''
verbose = VERBOSE
def __init__(self,
string: (str, dict),
charge=1,
mass_key='nist_mass',
verbose=False,
):
"""
Calculates many properties of a specified molecule.
:param str, dict string: The molecule to interpret. A composition dictionary may also be specified here.
:param int, str charge: the charge of the molecule (for mass spectrometric applications).
This will affect any properties related to the mass to charge
ratio. If the charge is specified in the input molecular formula, this will be
overridden.
:param str mass_key: The mass dictionary to use for calculations. Default is nist_mass, but additional mass
dictionaries may be defined in the mass_dictionary file and retrieved using the dictionary name
used to define them.
:param bool verbose: Verbose output. Mostly useful when calculating for large molecules or while debugging.
**Notes regarding string specification**
- Common abbreviations may be predefined in _mass_abbreviations.py (either locally or in the current working
directory)
- Use brackets to signify multiples of a given component (nested brackets are supported)
- Isotopes may be specified using an isotope-element format within a bracket (e.g. carbon 13 would be specified
as "(13C)" ). The mass of that isotope must be defined in the mass dictionary being used by the script
(default NIST mass).
- The charge may be specified in the formula, but care must be taken here. Charge must be specified in either
sign-value (e.g. '+2') or within a bracket. Otherwise, the script may attempt to interpret the charge as a
magnitude specifier of the previous block or as an isotope, and errors will be encountered.
- A composition dictionary with the format `{'Element': number_of_that_element, ...}` may be provided instead
of a string formula
"""
logger.debug(f'generating {self.__class__.__name__} object from input {string}')
# split charge into value and sign
self.charge, self.sign = interpret_charge(charge)
self.mass_key = mass_key # store mass dictionary that the script will use
self.verbose = verbose
if type(string) == dict: # if a composition dictionary was provided
self.composition = string
elif type(string) == str: # set string and interpret formula
self.molecular_formula = string
else:
raise TypeError(f'The provided string type is not interpretable: {type(string)}')
def __repr__(self):
return f'{self.__class__.__name__}({self.molecular_formula})'
def __str__(self):
return self.__repr__()
def __contains__(self, item):
if type(item) == str:
return item in self._comp
elif type(item) == list or type(item) == tuple:
return all([element in self._comp for element in item])
elif type(item) == dict:
return all([
element in self._comp and self._comp[element] >= num for element, num in item.items()
])
elif isinstance(item, Molecule):
return self.__contains__(item.composition)
else:
raise TypeError(f'The item {item} is not a recognized type for containment checks. Type: {type(item)}')
def __iter__(self):
for element in self._comp:
yield element
def __getitem__(self, item):
return self._comp[item]
def __eq__(self, other):
if type(other) == dict:
return other == self._comp
elif isinstance(other, Molecule):
return other.composition == self._comp
return False
def __ne__(self, other):
return not self.__eq__(other)
def __lt__(self, other):
if type(other) == dict:
return all([
number < self._comp[element] for element, number in other.items()
])
elif isinstance(other, Molecule):
return all([
number < self._comp[element] for element, number in other.composition.items()
])
else:
raise TypeError(f'Comparison of type {type(other)} to {self.__class__} is unsupported. ')
def __le__(self, other):
return self.__eq__(other) or self.__lt__(other)
def __gt__(self, other):
if type(other) == dict:
return all([
number > self._comp[element] for element, number in other.items()
])
elif isinstance(other, Molecule):
return all([
number > self._comp[element] for element, number in other.composition.items()
])
else:
raise TypeError(f'Comparison to type {type(other)} to {self.__class__} is unsupported. ')
def __ge__(self, other):
return self.__eq__(other) or self.__gt__(other)
def __getinitargs__(self):
return (
self.composition,
f'{self.charge}{self.sign}',
self.mass_key,
self.verbose,
)
def _get_kwargs(self) -> dict:
"""gets the kwargs associated with the instance for reinstantiation"""
return {
'charge': self.charge,
'mass_key': self.mass_key,
'verbose': self.verbose,
}
def __reduce__(self):
"""pickle support"""
return (
self.__class__,
self.__getinitargs__(),
)
def __add__(self, other):
"""
Several supported addition methods:
If a valid molecular formula string is provided, that string will be added.
If another Molecule class instance is provided, the provided instance will be
added to the current instance.
"""
if type(other) is str:
other = composition_from_formula(other)
elif isinstance(other, Molecule) is True:
other = other.composition
elif type(other) == dict:
pass
else:
raise ValueError(f'Addition of {other} to {self} is invalid')
new = copy.copy(self._comp) # starter for new dictionary
for key in other:
try:
new[key] += other[key]
except KeyError:
new[key] = other[key]
return self.__class__(
new,
**self._get_kwargs(),
)
def __radd__(self, other):
return self.__add__(other)
def __iadd__(self, other):
if type(other) is str:
other = composition_from_formula(other)
elif isinstance(other, Molecule) is True:
other = other.composition
elif type(other) == dict:
pass
else:
raise ValueError(f'Addition of {other} to {self} is invalid')
new = copy.copy(self._comp) # starter for new dictionary
for key in other:
try:
new[key] += other[key]
except KeyError:
new[key] = other[key]
self.composition = new
return self
def __sub__(self, other):
"""
See __add__ for details.
Subtract has a catch for a negative number of a given element
(the minimum that can be reached is zero).
"""
if type(other) is str:
other = composition_from_formula(other)
elif isinstance(other, Molecule) is True:
other = other.composition
elif type(other) == dict:
pass
else:
raise ValueError(f'Addition of {other} to {self} is invalid')
new = copy.copy(self._comp) # starter for new dictionary
for key in other:
if new[key] - other[key] < 0 or key not in new:
raise ValueError('Subtraction of {other[key]} {key} from {self} would yield a negative number of that '
'element.')
new[key] -= other[key]
return self.__class__(
new,
**self._get_kwargs(),
)
def __rsub__(self, other):
return self.__sub__(other)
def __isub__(self, other):
if type(other) is str:
other = composition_from_formula(other)
elif isinstance(other, Molecule) is True:
other = other.composition
elif type(other) == dict:
pass
else:
raise ValueError(f'Addition of {other} to {self} is invalid')
new = copy.copy(self._comp) # starter for new dictionary
for key in other:
if new[key] - other[key] < 0 or key not in new:
raise ValueError('Subtraction of {other[key]} {key} from {self} would yield a negative number of that '
'element.')
new[key] -= other[key]
self.composition = new
return self
def __mul__(self, other):
"""allows integer multiplication of the molecular formula"""
if type(other) != int:
raise ValueError(f'Non-integer multiplication of a {self.__class__.__name__} object is unsupported')
new = copy.copy(self._comp) # starter for new dictionary
for key in new:
new[key] = new[key] * other
return self.__class__(
new,
**self._get_kwargs(),
)
def __rmul__(self, other):
return self.__mul__(other)
def __imul__(self, other):
if type(other) != int:
raise ValueError(f'Non-integer multiplication of a {self.__class__.__name__} object is unsupported')
new = copy.copy(self._comp) # starter for new dictionary
for key in new:
new[key] = new[key] * other
self.composition = new
return self
def __truediv__(self, other):
"""allows integer division of the molecular formula"""
if type(other) != int:
raise ValueError(f'Non-integer division of a {self.__class__.__name__} object is unsupported')
new = copy.copy(self._comp) # starter for new dictionary
for key in new:
newval = new[key] / other
if newval.is_integer() is False:
raise ValueError(f'Division of {new[key]} {key} by {other} yielded a non-integer number {newval}')
new[key] = int(newval)
return self.__class__(
new,
**self._get_kwargs(),
)
def __itruediv__(self, other):
if type(other) != int:
raise ValueError(f'Non-integer division of a {self.__class__.__name__} object is unsupported')
new = copy.copy(self._comp) # starter for new dictionary
for key in new:
newval = new[key] / other
if newval.is_integer() is False:
raise ValueError(f'Division of {new[key]} {key} by {other} yielded a non-integer number {newval}')
new[key] = int(newval)
self.composition = new
return self
@property
def composition(self):
"""Composition dictionary"""
return self._comp
@composition.setter
def composition(self, dct):
if type(dct) != dict:
raise TypeError('The composition must be a dictionary')
dct = copy.copy(dct)
dct = abbreviations(dct) # check for and convert abbreviations
if 'charge' in dct: # if charge was specified in the formula
self.charge, self.sign = interpret_charge(dct['charge'])
del dct['charge']
check_in_mass_dict(dct) # check in mass dictionary
self._comp = dct # set local dictionary
@property
def molecular_formula(self):
"""Molecular formula of the molecule"""
out = ''
# todo catch carbon and hydrogen isotopes first
if 'C' in self.composition: # carbon and hydrogen first according to hill formula
out += f'C{self.composition["C"]}' if self.composition['C'] > 1 else 'C'
if 'H' in self.composition:
out += f'H{self.composition["H"]}' if self.composition['H'] > 1 else 'H'
for key, val in sorted(self.composition.items()): # alphabetically otherwise
if key != 'C' and key != 'H':
if key in mass_dict:
out += f'{key}{self.composition[key]}' if self.composition[key] > 1 else f'{key}'
else: # if an isotope
ele, iso = string_to_isotope(key)
out += f'({iso}{ele})'
out += f'{self.composition[key]}' if self.composition[key] > 1 else ''
return out
@molecular_formula.setter
def molecular_formula(self, formula):
self.composition = composition_from_formula(formula)
self._mf = formula
@property
def molecular_formula_formatted(self):
"""returns the subscript-formatted molecular formula"""
out = ''
if 'C' in self.composition:
out += f'C{to_subscript(self.composition["C"]) if self.composition["C"] > 1 else "C"}'
if 'H' in self.composition:
out += f'H{to_subscript(self.composition["H"]) if self.composition["H"] > 1 else "H"}'
for key, val in sorted(self.composition.items()):
if key not in ['C', 'H']:
if key in mass_dict:
out += f'{key}{to_subscript(self.composition[key])}' if self.composition[key] > 1 else f'{key}'
else:
ele, iso = string_to_isotope(key)
out += f'{to_superscript(iso)}{ele}'
out += f'{to_subscript(self.composition[key])}' if self.composition[key] > 1 else ''
return out
@property
def sf(self):
"""legacy catch for shorthand 'string formula' attribute"""
return self.molecular_formula
@property
def molecular_weight(self):
"""Molecular weight of the molecule"""
mwout = 0
for element, number in self.composition.items():
try:
mass = mass_dict[element]
for isotope in mass:
if isotope == 0:
continue
# add every isotope times its natural abundance times the number of that element
mwout += mass[isotope][0] * mass[isotope][1] * number
except KeyError: # if isotope
ele, iso = string_to_isotope(element)
mwout += mass_dict[ele][iso][0] * number # assumes 100% abundance if specified
return mwout
@property
def mw(self):
"""legacy catch for shorthand molecular weight"""
return self.molecular_weight
@property
def percent_composition(self):
"""Elemental percent composition of the molecule"""
pcompout = {} # percent composition dictionary
for element, number in self.composition.items():
try:
mass = mass_dict[element]
for isotope in mass:
if isotope == 0:
continue
if element not in pcompout:
pcompout[element] = 0.
# add mass contributed by that element
pcompout[element] += mass[isotope][0] * mass[isotope][1] * number
except KeyError: # if isotope
ele, iso = string_to_isotope(element)
pcompout[str(iso) + ele] = mass_dict[ele][iso][0] * number
mw = self.molecular_weight
for element in pcompout: # determines the percent composition of each element
try:
pcompout[element] = pcompout[element] / mw
except ZeroDivisionError:
pcompout[element] = 0.
return pcompout
@property
def pcomp(self):
"""legacy catch for shorthand percent composition"""
return self.percent_composition
@property
def monoisotopic_mass(self):
"""An estimation of the exact mass given by the molecular formula. This is likely not accurate for high-mass
species"""
em = 0.
for element, number in self.composition.items():
try:
em += mass_dict[element][0][0] * number
except KeyError:
ele, iso = string_to_isotope(element)
em += mass_dict[ele][iso][0] * number
# # accounts for the mass of an electron (uncomment if this affects your data)
# if self.sign == '+':
# em -= (9.10938356*10**-28)*charge
# if self.sign == '-':
# em += (9.10938356*10**-28)*charge
return em / self.charge
@property
def standard_deviation_comp(self):
"""
cacluates the standard deviation of the isotope pattern of the supplied composition
this calculation is based on Rockwood and Van Orden 1996 doi: 10.1021/ac951158i
"""
stdev = 0.
for element, number in self.composition.items():
meanmass = 0
eledev = 0 # elemental deviation
mass = mass_dict[element]
for isotope in mass: # calculate weighted average mass
if isotope != 0:
meanmass += sum(mass[isotope]) # weighted average mass
for isotope in mass:
if mass != 0:
eledev += mass[isotope][1] * (mass[isotope][0] - meanmass) ** 2
stdev += eledev * number
return np.sqrt(stdev)
def print_details(self):
"""prints the details of the generated molecule"""
print(f'{self}')
print(f'formula: {self.molecular_formula}')
print(f'molecular weight: {round(self.molecular_weight, 6)}')
print(f'monoisotopic mass: {round(self.monoisotopic_mass, 6)}')
print('')
self.print_percent_composition()
def print_percent_composition(self):
"""prints the percent composition in a reader-friendly format"""
print('elemental percent composition:')
for element, percent in sorted(self.percent_composition.items()):
print(f'{element}: {percent * 100.:6.4}%')
import logging
# subpackage logger parent
logger = logging.getLogger(__name__)
import os
import importlib.util
from ._mass_abbreviations import abbrvs
from . import _mass_dictionaries
"""Mass dictionary associated with the instance"""
MASS_KEY = 'crc_mass'
mass_dict = getattr(
_mass_dictionaries,
MASS_KEY,
)
# attempt to load abbreviation dictionary from current working directory
try:
abbrv_spec = importlib.util.spec_from_file_location(
'user_abbrvs',
os.path.join(
os.getcwd(),
'user_mass_abbreviations.py'
)
)
abbrv_module = importlib.util.module_from_spec(abbrv_spec)
abbrv_spec.loader.exec_module(abbrv_module)
user_abbrvs = abbrv_module.user_abbrvs
abbrvs.update(user_abbrvs)
except FileNotFoundError: # if it can't find the file, continue with default abbreviations
pass
def element_intensity_list(element: str):
"""
Returns the non-zero element intensity for the specified element.
:param element: element key
:return: mass, intensity lists
:rtype: list
"""
if element not in mass_dict:
raise KeyError(f'The element {element} is not defined in the mass dictionary.')
ele_dict = mass_dict[element]
mass_out = []
intensity_out = []
for isotope in ele_dict:
if isotope != 0 and ele_dict[isotope][1] != 0.:
mass_out.append(ele_dict[isotope][0])
intensity_out.append(ele_dict[isotope][1])
return [mass_out, intensity_out]
def string_to_isotope(string: str):
"""
Attempts to interpret an undefined key as an isotope/element combination (e.g. "13C" becomes 'C', 13). Raises a
ValueError if the string cannot be interpreted as such.
:param string: string to interpret
:return: element, isotope
:rtype: (str, int)
"""
iso = string[0]
if iso.isdigit() is False:
raise TypeError(f'The isotope "{string}" is not a valid format. Use isotope/element format e.g. "12C"')
ele = ''
i = 1
try:
while i < len(string):
if string[i].isdigit() is True:
iso += string[i]
i += 1
if string[i].isalpha() is True:
ele += string[i]
i += 1
return ele, int(iso)
except ValueError:
raise ValueError(
f'The string "{string}" could not be interpreted as an element, isotope combination, please check'
f'your input')
def check_in_mass_dict(comp: dict):
"""
Checks for the presence of the dictionary keys in the mass dictionary. Raises a ValueError if the key is not found.
:param comp: composition dictionary
"""
for key in comp:
if key not in mass_dict:
ele, iso = string_to_isotope(key)
if ele not in mass_dict:
raise ValueError(f'The element {ele} is not defined in the mass dictionary. Please check your input.')
elif iso not in mass_dict[ele]:
raise ValueError(
f'The element "{ele}" does not have a defined isotope "{iso}" in the mass dictionary. '
f'Please check your input.'
)
def abbreviations(dct: dict):
"""
Searches for abbreviations predefined in _mass_abbreviations.py either in the pythoms package or in the current
working directory. Any found abbreviations will be added to the current dictionary.
:param dct: incoming dictionary
:return: un-abbreviated dictionary
:rtype: dict
"""
comptemp = {}
for key in dct:
if key in abbrvs: # if a common abbreviation is found in formula
for subkey in abbrvs[key]:
try:
comptemp[subkey] += abbrvs[key][subkey] * dct[key]
except KeyError:
comptemp[subkey] = abbrvs[key][subkey] * dct[key]
else:
try:
comptemp[key] += dct[key]
except KeyError:
comptemp[key] = dct[key]
return comptemp
"""module for parsing molecular formulae"""
from .mass import abbreviations
SIGNS = ['+', '-'] # charge signs
unicode_subscripts = { # subscripts values for unit representations
0: f'\u2080',
1: f'\u2081',
2: f'\u2082',
3: f'\u2083',
4: f'\u2084',
5: f'\u2085',
6: f'\u2086',
7: f'\u2087',
8: f'\u2088',
9: f'\u2089',
}
unicode_superscripts = { # superscript values for unit representations
0: f'\u2070',
1: f'\u00b9',
2: f'\u00b2',
3: f'\u00b3',
4: f'\u2074',
5: f'\u2075',
6: f'\u2076',
7: f'\u2077',
8: f'\u2078',
9: f'\u2079',
}
# valid start and end brackets
OPENING_BRACKETS = ['(', '{', '['] # opening brackets
CLOSING_BRACKETS = [')', '}', ']'] # closing brackets
def interpret(block: str):
"""
Interprets an element block, breaking it into element and number of that element.
:param block: string block describing an element
:return: composition dictionary
:rtype: dict
"""
if block[0].isdigit() is True: # if isotope number is encountered
return {block: 1}
else:
ele = block[0]
i = 0
num = ''
while i < len(block) - 1:
i += 1
if block[i].isdigit() is True: # add digits
num += block[i]
elif block[i] == ' ': # ignore spaces
continue
else:
ele += block[i]
if num == '':
num = 1
else:
num = int(num)
return {ele: num}
def interpret_charge(string: str):
"""
Interprets a charge string.
:param string: string describing the charge (e.g. '2+')
:return: charge, sign
:rtype: tuple
"""
value = ''
sign = '+' # default value for sign
if type(string) is int:
return string, sign
for ind, val in enumerate(string):
if val in SIGNS: # if val sets mode
sign = val
else: # number
value += val
if value == '': # if no number was specified (e.g. "+")
value = 1
return int(value), sign
def to_subscript(number):
"""
Converts the value to subscript characters.
:param int number: number to convert
:return: subscript
:rtype: str
"""
return ''.join(
[unicode_subscripts[int(val)] for val in str(abs(number))]
)
def to_superscript(val):
"""
Returns the integer value represented as a superscript string.
:param int val: value to represent
:return: superscript string
:rtype: str
"""
return ''.join(
[unicode_superscripts[int(val)] for val in str(abs(val))]
)
def chew_formula(formula: str):
"""
Iterates through provided formula, extracting blocks, interpreting the blocks,
and returning the formula minus the blocks.
:param formula: string formula
:return: remaining formula, interpreted block
:rtype: str, dict
"""
if formula[0].isupper() is True: # element is recognized by an uppercase letter
block = formula[0] # element block
for loc in range(len(formula)):
if loc == 0:
continue
if formula[loc].isupper() is True: # if an uppercase character is encountered
break
elif formula[loc] in OPENING_BRACKETS: # if a bracket is encountered
break
else:
block += formula[loc]
return formula[len(block):], interpret(block) # return remaining formula and the interpreted block
elif formula[0] in OPENING_BRACKETS: # if a bracket is encountered, intialize bracket interpretation
return bracket(formula)
elif formula[0].isdigit() is True: # either isotope or charge
if any([sign in formula for sign in SIGNS]): # if the block is a value-sign charge specification
return '', {'charge': formula}
for ind, val in enumerate(formula):
if formula[ind].isalpha() is True: # if isotope encountered, return that isotope with n=1
return '', {formula: 1}
elif formula[0] in SIGNS: # charge specification
return '', {'charge': formula} # assign as charge for later interpretation
else:
raise ValueError(f'An uninterpretable formula chunck was encountered: {formula}')
def bracket(form):
"""finds the string block contained within a bracket and determines the formula within that bracket"""
bracktype = OPENING_BRACKETS.index(form[0]) # sets bracket type (so close bracket can be identified)
bnum = '' # number of things indicated in the bracket
block = '' # element block
nest = 1 # counter for nesting brackets
for loc in range(len(form)): # look for close bracket
if loc == 0:
continue
elif form[loc] == OPENING_BRACKETS[bracktype]: # if a nested bracket is encountered
nest += 1
block += form[loc]
elif form[loc] == CLOSING_BRACKETS[bracktype]: # if close bracket is encountered
nest -= 1
if nest == 0:
i = loc + 1 # index of close bracket
break
else:
block += form[loc]
else:
block += form[loc]
try: # look for digits outside of the bracket
while form[i].isdigit() is True:
bnum += form[i]
i += 1
except IndexError: # if i extends past the length of the formula
pass
except UnboundLocalError: # if a close bracket was not found, i will not be defined
raise ValueError(
f'A close bracket was not encountered for the "{form[0]}" bracket in the formula segment "{form}". '
f'Please check your input molecular formula.')
lblock = len(block) + len(
bnum) + 2 # length of the internal block + the length of the number + 2 for the brackets
if bnum == '': # if no number is specified
bnum = 1
else:
bnum = int(bnum)
outdict = {}
while len(block) > 0: # chew through bracket
ftemp, tempdict = chew_formula(block)
for key in tempdict:
try:
outdict[key] += tempdict[key] * bnum
except KeyError:
outdict[key] = tempdict[key] * bnum
block = ftemp
return form[lblock:], outdict # returns remaining formula and composition of the block
def composition_from_formula(formula):
"""
Interprets a provided string as a molecular formula.
Supports nested brackets, charges, and isotopes.
:param formula: A molecular formula. Charge may be specified in the formula, but care must be taken to specify
the charge in sign-value format (e.g. '+2' if value-sign is specified, the script will attempt to interpret the
key as an isotope).
:return: A dictionary where each key is an element or isotope with its value
being the number of each of the elements or isotopes. e.g. the
molecule CH4 would have the composition ``comp = {'C':1, 'H':4}``
:rtype: dict
"""
comp = {}
while len(formula) > 0: # chew through formula
ftemp, nomdict = chew_formula(formula) # find the next block
for ele in nomdict:
try:
comp[ele] += nomdict[ele]
except KeyError:
comp[ele] = nomdict[ele]
formula = ftemp
comp = abbreviations(comp) # look for common abbreviations
return comp
# default threshold for low-intensity peak dropping
THRESHOLD = 0.01
# number of peaks to keep for low-intensity peak dropping
NPEAKS = 5000
# consolidation threshold for low-intensity peak combination
CONSOLIDATE = 3
# chatty mode
VERBOSE = False
from itertools import combinations_with_replacement as cwr
import numpy as np
import sympy as sym
from scipy import stats
def standard_deviation(fwhm):
"""determines the standard deviation for a normal distribution with the full width at half max specified"""