11import numpy as np
22from bisect import bisect_left
3+ from scipy .optimize import curve_fit
4+
35
46__author__ = "Yuri E. Corilo"
57__date__ = "March 11, 2020"
@@ -107,11 +109,14 @@ def calc_dispersity_index(self):
107109 This function calculates the dispersity index of the mass feature and
108110 stores the result in the `_dispersity_index` attribute. The dispersity index is calculated as the standard
109111 deviation of the retention times that account for 50% of the cummulative intensity, starting from the most
110- intense point, as described in [1].
112+ intense point, as described in [1]. Note that this calculation is done within the integration bounds with
113+ a pad according to the window factor, where the window factor is parameterized and encapsulated in the
114+ parent LCMS object (or, if not available, defaults to 2.0 minutes before and after the apex
111115
112116 Returns
113117 -------
114- None, stores the result in the `_dispersity_index` attribute of the class.
118+ None, stores the result in the `_dispersity_index` attribute of the class and the `_normalized_dispersity_index` attribute,
119+ which is the dispersity index normalized to the total time window used for the calculation (unitless, fraction of total window).
115120
116121 Raises
117122 ------
@@ -124,25 +129,46 @@ def calc_dispersity_index(self):
124129 Organic Matter with Liquid Chromatography–Ultrahigh-Resolution Mass Spectrometry."
125130 Environmental Science & Technology 58.7 (2024): 3267-3277.
126131 """
132+ # Check if LCMSMassFeature has a parent LCMS object with a window factor
133+ if hasattr (self , "mass_spectrum_obj" ):
134+ window_min = self .mass_spectrum_obj .parameters .lc_ms .dispersity_index_window
135+ else :
136+ window_min = 3.0 # minutes
137+
127138 # Check if the EIC data is available
128139 if self .eic_list is None :
129140 raise ValueError (
130141 "EIC data are not available. Please add the EIC data first."
131142 )
132143
144+ # Define start and end of the window around the apex
145+ apex_rt = self .retention_time
146+ full_time = self ._eic_data .time
147+ full_eic = self ._eic_data .eic
148+ left_start = apex_rt - window_min
149+ right_end = apex_rt + window_min
150+
151+ # Extract the EIC data within the defined window
152+ time_mask = (full_time >= left_start ) & (full_time <= right_end )
153+ eic_subset = full_eic [time_mask ]
154+ time_subset = full_time [time_mask ]
155+
133156 # Sort the EIC data and RT data by descending intensity
134- eic_in = self .eic_list .argsort ()
135- sorted_eic = self .eic_list [eic_in [::- 1 ]]
136- sorted_rt = self .eic_rt_list [eic_in [::- 1 ]]
157+ sorted_eic = eic_subset [eic_subset .argsort ()[::- 1 ]]
158+ sorted_rt = time_subset [eic_subset .argsort ()[::- 1 ]]
137159
138160 # Calculate the dispersity index
139161 cum_sum = np .cumsum (sorted_eic ) / np .sum (sorted_eic )
140162 rt_summ = sorted_rt [np .where (cum_sum < 0.5 )]
141163 if len (rt_summ ) > 1 :
142164 d = np .std (rt_summ )
143- self ._dispersity_index = d
165+ self ._dispersity_index = d # minutes
166+ self ._normalized_dispersity_index = d / (
167+ time_subset [- 1 ] - time_subset [0 ]
168+ ) # unitless (fraction of total window used)
144169 elif len (rt_summ ) == 1 :
145170 self ._dispersity_index = 0
171+ self ._normalized_dispersity_index = 0
146172
147173 def calc_fraction_height_width (self , fraction : float ):
148174 """
@@ -258,3 +284,192 @@ def calc_tailing_factor(self, accept_estimated: bool = False):
258284 )
259285
260286 self ._tailing_factor = tailing_factor
287+
288+ def calc_gaussian_similarity (self ):
289+ """
290+ Calculate the Gaussian similarity score of the mass feature.
291+
292+ This function fits a Gaussian curve to the EIC data and evaluates
293+ the goodness of fit using R-squared. Note that this only uses data within
294+ the set integration bounds of the mass feature. A score close to 1 indicates
295+ the peak closely resembles an ideal Gaussian shape.
296+
297+ Returns
298+ -------
299+ None, stores the result in the `_gaussian_similarity` attribute of the class.
300+
301+ Raises
302+ ------
303+ ValueError
304+ If the EIC data are not available.
305+ """
306+ # Check if the EIC data is available
307+ if self .eic_list is None :
308+ raise ValueError (
309+ "EIC data are not available. Please add the EIC data first."
310+ )
311+
312+ # Get EIC data within integration bounds
313+ time_data = np .array (self .eic_rt_list )
314+ intensity_data = np .array (self .eic_list )
315+
316+ if len (time_data ) < 4 : # Need minimum points for meaningful fit
317+ self ._gaussian_similarity = np .nan
318+ return
319+
320+ # Check for valid intensity data
321+ max_intensity = np .max (intensity_data )
322+ if max_intensity == 0 :
323+ self ._gaussian_similarity = np .nan
324+ return
325+
326+ try :
327+ # Define Gaussian function
328+ def gaussian (x , amplitude , mean , stddev , baseline ):
329+ return (
330+ amplitude * np .exp (- ((x - mean ) ** 2 ) / (2 * stddev ** 2 )) + baseline
331+ )
332+
333+ # Initial parameter estimates
334+ amplitude_init = max_intensity
335+ mean_init = time_data [np .argmax (intensity_data )]
336+ stddev_init = (time_data [- 1 ] - time_data [0 ]) / 6 # Rough estimate
337+ baseline_init = np .min (intensity_data )
338+
339+ # Fit Gaussian curve
340+ popt , _ = curve_fit (
341+ gaussian ,
342+ time_data ,
343+ intensity_data ,
344+ p0 = [amplitude_init , mean_init , stddev_init , baseline_init ],
345+ maxfev = 1000 ,
346+ bounds = (
347+ [0 , time_data [0 ], 0 , 0 ], # Lower bounds
348+ [np .inf , time_data [- 1 ], np .inf , max_intensity ], # Upper bounds
349+ ),
350+ )
351+
352+ # Calculate fitted values
353+ fitted_intensities = gaussian (time_data , * popt )
354+
355+ # Calculate R-squared (coefficient of determination)
356+ ss_res = np .sum ((intensity_data - fitted_intensities ) ** 2 )
357+ ss_tot = np .sum ((intensity_data - np .mean (intensity_data )) ** 2 )
358+
359+ if ss_tot == 0 :
360+ self ._gaussian_similarity = np .nan
361+ else :
362+ r_squared = 1 - (ss_res / ss_tot )
363+ # R² should be between 0 and 1 for reasonable fits
364+ # If negative, the model is worse than the mean - treat as non-computable
365+ self ._gaussian_similarity = r_squared if r_squared >= 0 else np .nan
366+
367+ except (RuntimeError , ValueError , TypeError ):
368+ # Fitting failed, assign NaN
369+ self ._gaussian_similarity = np .nan
370+
371+ def calc_noise_score (self ):
372+ """
373+ Calculate the noise score of the mass feature separately for left and right sides.
374+
375+ This function estimates the signal-to-noise ratio by comparing the peak
376+ intensity to the baseline noise level in surrounding regions. It calculates
377+ separate scores for the left and right sides of the peak, which are stored as a tuple
378+ in the `_noise_score` attribute. The noise estimation windows are encapsulated in the
379+ parent LCMS object (or, if not available, defaults to twice the peak width on each side).
380+
381+
382+ Returns
383+ -------
384+ None, stores the result in the `_noise_score` attribute as a tuple (left_score, right_score).
385+
386+ Raises
387+ ------
388+ ValueError
389+ If the EIC data are not available.
390+ """
391+ # Check if the EIC data is available
392+ if self .eic_list is None :
393+ raise ValueError (
394+ "EIC data are not available. Please add the EIC data first."
395+ )
396+
397+ # Check if LCMSMassFeature has a parent LCMS object with a window factor
398+ if hasattr (self , "mass_spectrum_obj" ):
399+ noise_window_factor = (
400+ self .mass_spectrum_obj .parameters .lc_ms .noise_window_factor
401+ )
402+ else :
403+ noise_window_factor = 2.0 # times the peak width
404+
405+ # Get full EIC data (not just integration bounds)
406+ full_time = self ._eic_data .time
407+ full_eic = self ._eic_data .eic
408+
409+ # Get peak information
410+ apex_rt = self .retention_time
411+ peak_intensity = np .max (self .eic_list )
412+
413+ # Retrieve width from integration bounds
414+ peak_width = self .eic_rt_list [- 1 ] - self .eic_rt_list [0 ]
415+
416+ # Define noise estimation windows
417+ noise_window_size = peak_width * noise_window_factor # in minutes
418+ left_noise_start = apex_rt - peak_width - noise_window_size
419+ left_noise_end = apex_rt - peak_width
420+ right_noise_start = apex_rt + peak_width
421+ right_noise_end = apex_rt + peak_width + noise_window_size
422+
423+ # Extract noise regions
424+ left_noise_mask = (full_time >= left_noise_start ) & (
425+ full_time <= left_noise_end
426+ )
427+ right_noise_mask = (full_time >= right_noise_start ) & (
428+ full_time <= right_noise_end
429+ )
430+
431+ left_noise = full_eic [left_noise_mask ]
432+ right_noise = full_eic [right_noise_mask ]
433+
434+ # Calculate left noise score
435+ if len (left_noise ) == 0 :
436+ left_score = np .nan
437+ else :
438+ left_baseline = np .median (left_noise )
439+ left_noise_std = np .std (left_noise )
440+
441+ if left_noise_std == 0 :
442+ if peak_intensity > left_baseline :
443+ left_score = 1.0
444+ else :
445+ left_score = np .nan
446+ else :
447+ left_signal = peak_intensity - left_baseline
448+ if left_signal <= 0 :
449+ left_score = 0.0
450+ else :
451+ left_snr = left_signal / left_noise_std
452+ left_score = min (1.0 , left_snr / (left_snr + 10.0 ))
453+
454+ # Calculate right noise score
455+ if len (right_noise ) == 0 :
456+ right_score = np .nan
457+ else :
458+ right_baseline = np .median (right_noise )
459+ right_noise_std = np .std (right_noise )
460+
461+ if right_noise_std == 0 :
462+ if peak_intensity > right_baseline :
463+ right_score = 1.0
464+ else :
465+ right_score = np .nan
466+ else :
467+ right_signal = peak_intensity - right_baseline
468+ if right_signal <= 0 :
469+ right_score = 0.0
470+ else :
471+ right_snr = right_signal / right_noise_std
472+ right_score = min (1.0 , right_snr / (right_snr + 10.0 ))
473+
474+ # Store as tuple
475+ self ._noise_score = (left_score , right_score )
0 commit comments