Skip to content

Commit ec2dbe8

Browse files
committed
Merge branch 'hilic_explore' into 'master'
Add calculation of S/N and gaussian fit for LCMSMassFeatures, and ability to filter using these Closes #227 See merge request mass-spectrometry/corems!191
2 parents 93eed4a + 7940ea1 commit ec2dbe8

32 files changed

Lines changed: 21563 additions & 17834 deletions

.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 3.8.2
2+
current_version = 3.9.0
33
commit = False
44
tag = False
55

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ CoreMS aims to provide
4949

5050
## Current Version
5151

52-
`3.8.2`
52+
`3.9.0`
5353

5454
***
5555

@@ -336,7 +336,7 @@ UML (unified modeling language) diagrams for Direct Infusion FT-MS and GC-MS cla
336336
337337
If you use CoreMS in your work, please use the following citation:
338338
339-
Version [3.8.2 Release on GitHub](https://github.com/EMSL-Computing/CoreMS/releases/tag/v3.8.2), archived on Zenodo:
339+
Version [3.9.0 Release on GitHub](https://github.com/EMSL-Computing/CoreMS/releases/tag/v3.9.0), archived on Zenodo:
340340
341341
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.14009575.svg)](https://doi.org/10.5281/zenodo.14009575)
342342

corems/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
__author__ = "Yuri E. Corilo"
2-
__version__ = "3.8.2"
2+
__version__ = "3.9.0"
33
import time
44
import os
55
import sys

corems/chroma_peak/calc/ChromaPeakCalc.py

Lines changed: 221 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import numpy as np
22
from bisect import bisect_left
3+
from scipy.optimize import curve_fit
4+
35

46
__author__ = "Yuri E. Corilo"
57
__date__ = "March 11, 2020"
@@ -107,11 +109,14 @@ def calc_dispersity_index(self):
107109
This function calculates the dispersity index of the mass feature and
108110
stores the result in the `_dispersity_index` attribute. The dispersity index is calculated as the standard
109111
deviation of the retention times that account for 50% of the cumulative intensity, starting from the most
110-
intense point, as described in [1].
112+
intense point, as described in [1]. Note that this calculation is done within the integration bounds with
113+
a pad according to the window factor, where the window factor is parameterized and encapsulated in the
114+
parent LCMS object (or, if not available, defaults to 3.0 minutes before and after the apex).
111115
112116
Returns
113117
-------
114-
None, stores the result in the `_dispersity_index` attribute of the class.
118+
None, stores the result in the `_dispersity_index` attribute of the class and the `_normalized_dispersity_index` attribute,
119+
which is the dispersity index normalized to the total time window used for the calculation (unitless, fraction of total window).
115120
116121
Raises
117122
------
@@ -124,25 +129,46 @@ def calc_dispersity_index(self):
124129
Organic Matter with Liquid Chromatography–Ultrahigh-Resolution Mass Spectrometry."
125130
Environmental Science & Technology 58.7 (2024): 3267-3277.
126131
"""
132+
# Check if LCMSMassFeature has a parent LCMS object with a window factor
133+
if hasattr(self, "mass_spectrum_obj"):
134+
window_min = self.mass_spectrum_obj.parameters.lc_ms.dispersity_index_window
135+
else:
136+
window_min = 3.0 # minutes
137+
127138
# Check if the EIC data is available
128139
if self.eic_list is None:
129140
raise ValueError(
130141
"EIC data are not available. Please add the EIC data first."
131142
)
132143

144+
# Define start and end of the window around the apex
145+
apex_rt = self.retention_time
146+
full_time = self._eic_data.time
147+
full_eic = self._eic_data.eic
148+
left_start = apex_rt - window_min
149+
right_end = apex_rt + window_min
150+
151+
# Extract the EIC data within the defined window
152+
time_mask = (full_time >= left_start) & (full_time <= right_end)
153+
eic_subset = full_eic[time_mask]
154+
time_subset = full_time[time_mask]
155+
133156
# Sort the EIC data and RT data by descending intensity
134-
eic_in = self.eic_list.argsort()
135-
sorted_eic = self.eic_list[eic_in[::-1]]
136-
sorted_rt = self.eic_rt_list[eic_in[::-1]]
157+
sorted_eic = eic_subset[eic_subset.argsort()[::-1]]
158+
sorted_rt = time_subset[eic_subset.argsort()[::-1]]
137159

138160
# Calculate the dispersity index
139161
cum_sum = np.cumsum(sorted_eic) / np.sum(sorted_eic)
140162
rt_summ = sorted_rt[np.where(cum_sum < 0.5)]
141163
if len(rt_summ) > 1:
142164
d = np.std(rt_summ)
143-
self._dispersity_index = d
165+
self._dispersity_index = d # minutes
166+
self._normalized_dispersity_index = d / (
167+
time_subset[-1] - time_subset[0]
168+
) # unitless (fraction of total window used)
144169
elif len(rt_summ) == 1:
145170
self._dispersity_index = 0
171+
self._normalized_dispersity_index = 0
146172

147173
def calc_fraction_height_width(self, fraction: float):
148174
"""
@@ -258,3 +284,192 @@ def calc_tailing_factor(self, accept_estimated: bool = False):
258284
)
259285

260286
self._tailing_factor = tailing_factor
287+
288+
def calc_gaussian_similarity(self):
    """
    Calculate the Gaussian similarity score of the mass feature.

    A four-parameter Gaussian (amplitude, mean, stddev, baseline) is fit to
    the EIC data within the mass feature's integration bounds, and the
    goodness of fit is reported as the coefficient of determination
    (R-squared). A score close to 1 indicates the peak closely resembles an
    ideal Gaussian shape.

    Returns
    -------
    None, stores the result in the `_gaussian_similarity` attribute of the
    class. The value is np.nan when the fit fails or cannot be attempted.

    Raises
    ------
    ValueError
        If the EIC data are not available.
    """
    if self.eic_list is None:
        raise ValueError(
            "EIC data are not available. Please add the EIC data first."
        )

    # EIC data restricted to the integration bounds of the mass feature
    rt = np.array(self.eic_rt_list)
    signal = np.array(self.eic_list)

    # A meaningful four-parameter fit needs at least four points
    if len(rt) < 4:
        self._gaussian_similarity = np.nan
        return

    peak_height = np.max(signal)
    if peak_height == 0:
        # All-zero trace: nothing to fit
        self._gaussian_similarity = np.nan
        return

    def _gauss(x, amplitude, mean, stddev, baseline):
        # Standard Gaussian with a constant baseline offset
        return amplitude * np.exp(-((x - mean) ** 2) / (2 * stddev**2)) + baseline

    # Initial guesses: apex height and position, ~1/6 of the window as
    # sigma, and the trace minimum as baseline
    p0 = [
        peak_height,
        rt[np.argmax(signal)],
        (rt[-1] - rt[0]) / 6,
        np.min(signal),
    ]

    try:
        popt, _ = curve_fit(
            _gauss,
            rt,
            signal,
            p0=p0,
            maxfev=1000,
            bounds=(
                [0, rt[0], 0, 0],  # Lower bounds
                [np.inf, rt[-1], np.inf, peak_height],  # Upper bounds
            ),
        )

        predicted = _gauss(rt, *popt)

        # Coefficient of determination against the mean model
        residual_ss = np.sum((signal - predicted) ** 2)
        total_ss = np.sum((signal - np.mean(signal)) ** 2)

        if total_ss == 0:
            # Flat signal: R-squared is undefined
            self._gaussian_similarity = np.nan
        else:
            r_squared = 1 - (residual_ss / total_ss)
            # Negative R-squared means the fit is worse than the mean
            # model - treat as non-computable
            self._gaussian_similarity = r_squared if r_squared >= 0 else np.nan

    except (RuntimeError, ValueError, TypeError):
        # curve_fit did not converge or received invalid input
        self._gaussian_similarity = np.nan
370+
371+
def calc_noise_score(self):
    """
    Calculate the noise score of the mass feature separately for left and right sides.

    This function estimates the signal-to-noise ratio by comparing the peak
    intensity to the baseline noise level in regions flanking the peak. It
    calculates separate scores for the left and right sides of the peak,
    which are stored as a tuple in the `_noise_score` attribute. The noise
    estimation windows are sized by the noise window factor encapsulated in
    the parent LCMS object (or, if not available, defaults to twice the peak
    width on each side).

    Returns
    -------
    None, stores the result in the `_noise_score` attribute as a tuple
    (left_score, right_score). A side's score is np.nan when no EIC points
    fall in its noise window or when the noise is flat and the peak does not
    rise above it; 0.0 when the peak does not rise above the baseline; and
    otherwise a bounded SNR score in (0, 1].

    Raises
    ------
    ValueError
        If the EIC data are not available.
    """
    # Check if the EIC data is available
    if self.eic_list is None:
        raise ValueError(
            "EIC data are not available. Please add the EIC data first."
        )

    # Window factor from the parent LCMS object's parameters when available
    if hasattr(self, "mass_spectrum_obj"):
        noise_window_factor = (
            self.mass_spectrum_obj.parameters.lc_ms.noise_window_factor
        )
    else:
        noise_window_factor = 2.0  # times the peak width

    # Full EIC trace (not just the integration bounds)
    full_time = self._eic_data.time
    full_eic = self._eic_data.eic

    # Peak information
    apex_rt = self.retention_time
    peak_intensity = np.max(self.eic_list)

    # Peak width taken from the integration bounds
    peak_width = self.eic_rt_list[-1] - self.eic_rt_list[0]
    noise_window_size = peak_width * noise_window_factor  # in minutes

    def _side_score(noise):
        """Score one flanking noise region against the peak intensity."""
        if len(noise) == 0:
            # No EIC points fell inside this noise window
            return np.nan
        baseline = np.median(noise)
        noise_std = np.std(noise)
        if noise_std == 0:
            # Flat noise region: perfect score if the peak rises above it,
            # otherwise the score is not computable
            return 1.0 if peak_intensity > baseline else np.nan
        signal = peak_intensity - baseline
        if signal <= 0:
            # Peak does not rise above the local baseline
            return 0.0
        snr = signal / noise_std
        # Map SNR onto (0, 1]; min() guards the upper bound
        return min(1.0, snr / (snr + 10.0))

    # Flanking noise windows, one peak-width away from the apex on each side
    left_mask = (full_time >= apex_rt - peak_width - noise_window_size) & (
        full_time <= apex_rt - peak_width
    )
    right_mask = (full_time >= apex_rt + peak_width) & (
        full_time <= apex_rt + peak_width + noise_window_size
    )

    # Store as tuple (left_score, right_score)
    self._noise_score = (
        _side_score(full_eic[left_mask]),
        _side_score(full_eic[right_mask]),
    )

0 commit comments

Comments
 (0)