Source code for allensdk.ephys.feature_extractor

# Allen Institute Software License - This software license is the 2-clause BSD
# license plus a third clause that prohibits redistribution for commercial
# purposes without further permission.
#
# Copyright 2015-2016. Allen Institute. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Redistributions for commercial purposes are not permitted without the
# Allen Institute's written permission.
# For purposes of this license, commercial purposes is the incorporation of the
# Allen Institute's software into anything for which you will charge fees or
# other compensation. Contact terms@alleninstitute.org for commercial licensing
# opportunities.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
import sys
import math
import numpy as np
import scipy.signal as signal
import logging

# Design notes:
# to generate an average feature file, all sweeps must have all features
# to generate a fitness score of a sweep to a feature file, the sweep
#   must have all features in the file. If one is absent, a penalty
#   of TODO ??? will be assessed

# set of features
class EphysFeatures(object):
    def __init__(self, name):
        # feature mean and standard deviations
        self.mean = {}
        self.stdev = {}
        # human-readable names for features
        self.glossary = {}
        # table indicating how to score feature
        #   'hit'   feature exists:
        #       'ignore'     do nothing
        #       'stdev'      score is # stdevs from target mean
        #   'miss'  feature absent:
        #       'const'      score = scoring['const']
        #       'mean_mult'  score = mean * scoring['mean_mult']
        #
        self.scoring = {}
        self.name = name

        ################################################################
        # ignore scores
        ignore_score = {"hit": "ignore"}
        self.glossary["n_spikes"] = "Number of spikes"
        self.scoring["n_spikes"] = ignore_score

        ################################################################
        # ignore misses
        ignore_miss = {"hit": "stdev", "miss": "const", "const": 0}
        self.glossary["adapt"] = "Adaptation index"
        self.scoring["adapt"] = ignore_miss
        self.glossary["latency"] = "Time to first spike (ms)"
        self.scoring["latency"] = ignore_miss

        ################################################################
        # base miss off mean
        mean_score = {"hit": "stdev", "miss": "mean_mult", "mean_mult": 2}
        self.glossary["ISICV"] = "ISI-CV"
        self.scoring["ISICV"] = mean_score

        ################################################################
        # normal scoring
        normal_score = {"hit": "stdev", "miss": "const", "const": 20}
        self.glossary["isi_avg"] = "Average ISI (ms)"
        self.scoring["isi_avg"] = ignore_score
        self.glossary["doublet"] = "Doublet ISI (ms)"
        self.scoring["doublet"] = normal_score
        self.glossary["f_fast_ahp"] = "Fast AHP (mV)"
        self.scoring["f_fast_ahp"] = normal_score
        self.glossary["f_slow_ahp"] = "Slow AHP (mV)"
        self.scoring["f_slow_ahp"] = normal_score
        self.glossary["f_slow_ahp_time"] = "Slow AHP time"
        self.scoring["f_slow_ahp_time"] = normal_score
        self.glossary["base_v"] = "Baseline voltage (mV)"
        self.scoring["base_v"] = normal_score
        #self.glossary["base_v2"] = "Baseline voltage 2 (mV)"
        #self.scoring["base_v2"] = normal_score
        #self.glossary["base_v3"] = "Baseline voltage 3 (mV)"
        #self.scoring["base_v3"] = normal_score

        ################################################################
        # per spike scoring
        perspike_score = {"hit": "perspike", "miss": "const",
                          "const": 20, "skip_last_n": 0}
        self.glossary["f_peak"] = "Spike height (mV)"
        self.scoring["f_peak"] = perspike_score.copy()
        self.glossary["f_trough"] = "Spike depth (mV)"
        self.scoring["f_trough"] = perspike_score.copy()
        self.scoring["f_trough"]["skip_last_n"] = 1
        # self.glossary["f_w"] = "Spike width at -30 mV (ms)"
        # self.scoring["f_w"] = perspike_score.copy()
        self.glossary["upstroke"] = "Peak upstroke (mV/ms)"
        self.scoring["upstroke"] = perspike_score.copy()
        self.glossary["upstroke_v"] = "Vm of peak upstroke (mV)"
        self.scoring["upstroke_v"] = perspike_score.copy()
        self.glossary["downstroke"] = "Peak downstroke (mV/ms)"
        self.scoring["downstroke"] = perspike_score.copy()
        self.glossary["downstroke_v"] = "Vm of peak downstroke (mV)"
        self.scoring["downstroke_v"] = perspike_score.copy()
        self.glossary["threshold"] = "Threshold voltage (mV)"
        self.scoring["threshold"] = perspike_score.copy()
        self.glossary["width"] = "Spike width at half-max (ms)"
        self.scoring["width"] = perspike_score.copy()
        self.scoring["width"]["skip_last_n"] = 1
        self.glossary["thresh_ramp"] = "Change in dv/dt over first 5 mV past threshold (mV/ms)"
        self.scoring["thresh_ramp"] = perspike_score.copy()

        ################################################################
        # heavily penalize when there are no spikes
        spike_score = {"hit": "stdev", "miss": "const", "const": 250}
        self.glossary["rate"] = "Firing rate (Hz)"
        self.scoring["rate"] = spike_score
    def print_out(self):
        print("Features from " + self.name)
        for k in self.mean.keys():
            if k in self.glossary:
                st = "%30s = " % self.glossary[k]
                if self.mean[k] is not None:
                    st += "%g" % self.mean[k]
                else:
                    st += "--------"
                if k in self.stdev and self.stdev[k] is not None:
                    st += " +/- %g" % self.stdev[k]
                print(st)
    # initialize summary feature set from file
    def clone(self, param_dict):
        for k in param_dict.keys():
            self.mean[k] = param_dict[k]["mean"]
            self.stdev[k] = param_dict[k]["stdev"]
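# Usage sketch (hypothetical values; the {feature: {"mean": ..., "stdev": ...}}
# layout is inferred from clone() above): a target feature set can be filled in
# by hand and inspected with print_out().
#
#   target = EphysFeatures("target")
#   target.clone({
#       "rate":    {"mean": 12.0, "stdev": 2.5},   # firing rate (Hz)
#       "latency": {"mean": 45.0, "stdev": 10.0},  # time to first spike (ms)
#   })
#   target.print_out()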
class EphysFeatureExtractor(object):
    def __init__(self):
        # list of feature set instances
        self.feature_list = []
        # names of each element in feature list
        self.feature_source = []
        # feature set object representing combination of all instances
        self.summary = None

    # adds new feature set instance to feature_list
    def process_instance(self, name, v, curr, t, onset, dur, stim_name):
        feature = EphysFeatures(name)

        ################################################################
        # set stop time -- run until end of stimulus or end of sweep
        # comment out one of the two approaches

        # detect spikes only during stimulus
        start = onset
        stop = onset + dur

        # detect spikes for all of sweep
        #start = 0
        #stop = t[-1]

        ################################################################
        # pull out spike times

        # calculate the derivative only within target window
        # otherwise get spurious detection at ends of stimuli
        # filter with 10kHz cutoff if constant 200kHz sample rate (ie experimental trace)
        start_idx = np.where(t >= start)[0][0]
        stop_idx = np.where(t >= stop)[0][0]
        v_target = v[start_idx:stop_idx]
        if np.abs(t[1] - t[0] - 5e-6) < 1e-7 and np.var(np.diff(t)) < 1e-6:
            b, a = signal.bessel(4, 0.1, "low")
            smooth_v = signal.filtfilt(b, a, v_target, axis=0)
            dv = np.diff(smooth_v)
        else:
            dv = np.diff(v_target)
        dvdt = dv / (np.diff(t[start_idx:stop_idx]) * 1e3)  # in mV/ms

        dv_cutoff = 20
        thresh_pct = 0.05
        spikes = []

        # find positive-going crossings of dv_cutoff (20 mV/ms)
        temp_spk_idxs = np.where(np.diff(np.greater_equal(dvdt, dv_cutoff).astype(int)) == 1)[0]
        spk_idxs = []
        for i, temp in enumerate(temp_spk_idxs):
            if i == 0:
                spk_idxs.append(temp)
            elif np.any(dvdt[temp_spk_idxs[i - 1]:temp] < 0):
                # check if the dvdt has gone back down below zero between presumed spike times
                # sometimes the dvdt bobbles around detection threshold and produces spurious guesses at spike times
                spk_idxs.append(temp)
        spk_idxs = np.array(spk_idxs, dtype=int)
        spk_idxs += start_idx  # set back to the "index space" of the original trace

        # recalculate full dv/dt for feature analysis (vs spike detection)
        if np.abs(t[1] - t[0] - 5e-6) < 1e-7 and np.var(np.diff(t)) < 1e-6:
            b, a = signal.bessel(4, 0.1, "low")
            smooth_v = signal.filtfilt(b, a, v, axis=0)
            dv = np.diff(smooth_v)
        else:
            dv = np.diff(v)
        dvdt = dv / (np.diff(t) * 1e3)  # in mV/ms

        # First time through, accumulate upstrokes to calculate average threshold target
        for spk_n, spk_idx in enumerate(spk_idxs):
            # Etay defines spike as time of threshold crossing
            spk = {}

            if spk_n < len(spk_idxs) - 1:
                next_idx = spk_idxs[spk_n + 1]
            else:
                next_idx = stop_idx

            if spk_n > 0:
                prev_idx = spk_idxs[spk_n - 1]
            else:
                prev_idx = start_idx

            # Find the peak
            peak_idx = np.argmax(v[spk_idx:next_idx]) + spk_idx
            spk["peak_idx"] = peak_idx
            spk["f_peak"] = v[peak_idx]
            spk["f_peak_i"] = curr[peak_idx]
            spk["f_peak_t"] = t[peak_idx]

            # Check if end of stimulus interval cuts off spike - if so, don't process spike
            if spk_n == len(spk_idxs) - 1 and peak_idx == next_idx - 1:
                continue
            if spk_idx == peak_idx:
                continue  # this was bugfix, but why? ramp?

            # Determine maximum upstroke of spike
            upstroke_idx = np.argmax(dvdt[spk_idx:peak_idx]) + spk_idx
            spk["upstroke"] = dvdt[upstroke_idx]
            if np.isnan(spk["upstroke"]):
                # sometimes dvdt will be NaN because of multiple cvode points at same time step
                close_idx = upstroke_idx + 1
                while np.isnan(dvdt[close_idx]):
                    close_idx += 1
                spk["upstroke_idx"] = close_idx
                spk["upstroke"] = dvdt[close_idx]
                spk["upstroke_v"] = v[close_idx]
                spk["upstroke_i"] = curr[close_idx]
                spk["upstroke_t"] = t[close_idx]
            else:
                spk["upstroke_idx"] = upstroke_idx
                spk["upstroke_v"] = v[upstroke_idx]
                spk["upstroke_i"] = curr[upstroke_idx]
                spk["upstroke_t"] = t[upstroke_idx]

            # Preliminarily define threshold where dvdt = 5% * max upstroke
            thresh_pct = 0.05
            find_thresh_idxs = np.where(dvdt[prev_idx:upstroke_idx] <= thresh_pct * spk["upstroke"])[0]
            if len(find_thresh_idxs) < 1:
                # Can't find a good threshold value - probably a bad simulation case
                # Fall back to the upstroke value
                threshold_idx = upstroke_idx
            else:
                threshold_idx = find_thresh_idxs[-1] + prev_idx
            spk["threshold_idx"] = threshold_idx
            spk["threshold"] = v[threshold_idx]
            spk["threshold_v"] = v[threshold_idx]
            spk["threshold_i"] = curr[threshold_idx]
            spk["threshold_t"] = t[threshold_idx]
            spk["rise_time"] = spk["f_peak_t"] - spk["threshold_t"]

            PERIOD = t[1] - t[0]
            width_volts = (v[peak_idx] + v[threshold_idx]) / 2
            recording_width = False
            for i in range(threshold_idx, min(len(v), threshold_idx + int(0.001 / PERIOD))):
                if not recording_width and v[i] >= width_volts:
                    recording_width = True
                    idx0 = i
                elif recording_width and v[i] < width_volts:
                    spk["half_height_width"] = t[i] - t[idx0]
                    break
            # </KEITH>

            # Check for things that are probably not spikes:
            # if there is more than 2 ms between the detection event and the peak, don't count it
            if t[peak_idx] - t[threshold_idx] > 0.002:
                continue
            # if the "spike" is less than 2 mV, don't count it
            if v[peak_idx] - v[threshold_idx] < 2.0:
                continue
            # if the absolute value of the peak is less than -30 mV, don't count it
            if v[peak_idx] < -30.0:
                continue

            spikes.append(spk)

        # Refine threshold target based on average of all spikes
        if len(spikes) > 0:
            threshold_target = np.array([spk["upstroke"] for spk in spikes]).mean() * thresh_pct
            for spk_n, spk in enumerate(spikes):
                if spk_n < len(spikes) - 1:
                    next_idx = spikes[spk_n + 1]["threshold_idx"]
                else:
                    next_idx = stop_idx

                if spk_n > 0:
                    prev_idx = spikes[spk_n - 1]["peak_idx"]
                else:
                    prev_idx = start_idx

                # Restore variables from before
                # peak_idx = spk['peak_idx']
                peak_idx = np.argmax(v[spk['threshold_idx']:next_idx]) + spk['threshold_idx']
                spk["peak_idx"] = peak_idx
                spk["f_peak"] = v[peak_idx]
                spk["f_peak_i"] = curr[peak_idx]
                spk["f_peak_t"] = t[peak_idx]

                # Determine maximum upstroke of spike
                # upstroke_idx = spk['upstroke_idx']
                upstroke_idx = np.argmax(dvdt[spk['threshold_idx']:peak_idx]) + spk['threshold_idx']
                spk["upstroke"] = dvdt[upstroke_idx]
                if np.isnan(spk["upstroke"]):
                    # sometimes dvdt will be NaN because of multiple cvode points at same time step
                    close_idx = upstroke_idx + 1
                    while np.isnan(dvdt[close_idx]):
                        close_idx += 1
                    spk["upstroke_idx"] = close_idx
                    spk["upstroke"] = dvdt[close_idx]
                    spk["upstroke_v"] = v[close_idx]
                    spk["upstroke_i"] = curr[close_idx]
                    spk["upstroke_t"] = t[close_idx]
                else:
                    spk["upstroke_idx"] = upstroke_idx
                    spk["upstroke_v"] = v[upstroke_idx]
                    spk["upstroke_i"] = curr[upstroke_idx]
                    spk["upstroke_t"] = t[upstroke_idx]

                # Find threshold based on average target
                find_thresh_idxs = np.where(dvdt[prev_idx:upstroke_idx] <= threshold_target)[0]
                if len(find_thresh_idxs) < 1:
                    # Can't find a good threshold value - probably a bad simulation case
                    # Fall back to the upstroke value
                    threshold_idx = upstroke_idx
                else:
                    threshold_idx = find_thresh_idxs[-1] + prev_idx
                spk["threshold_idx"] = threshold_idx
                spk["threshold"] = v[threshold_idx]
                spk["threshold_v"] = v[threshold_idx]
                spk["threshold_i"] = curr[threshold_idx]
                spk["threshold_t"] = t[threshold_idx]

                # Define the spike time as threshold time
                spk["t_idx"] = threshold_idx
                spk["t"] = t[threshold_idx]

                # Save the -30 mV crossing time for backward compatibility with Etay code
                overn30_idxs = np.where(v[threshold_idx:peak_idx] >= -30)[0]
                if len(overn30_idxs) > 0:
                    spk["t_idx_n30"] = overn30_idxs[0] + threshold_idx
                else:
                    # fall back to threshold definition if spike doesn't cross -30 mV
                    spk["t_idx_n30"] = threshold_idx
                spk["t_n30"] = t[spk["t_idx_n30"]]

                # Figure out initial "slope" of phase plot post-threshold
                plus_5_vec = np.where(v[threshold_idx:upstroke_idx] >= spk["threshold"] + 5)[0]
                if len(plus_5_vec) > 0:
                    thresh_plus_5_idx = plus_5_vec[0] + threshold_idx
                    spk["thresh_ramp"] = dvdt[thresh_plus_5_idx] - dvdt[threshold_idx]
                else:
                    spk["thresh_ramp"] = dvdt[upstroke_idx] - dvdt[threshold_idx]

                # go forward to determine peak downstroke of spike
                downstroke_idx = np.argmin(dvdt[peak_idx:next_idx]) + peak_idx
                spk["downstroke_idx"] = downstroke_idx
                spk["downstroke_v"] = v[downstroke_idx]
                spk["downstroke_i"] = curr[downstroke_idx]
                spk["downstroke_t"] = t[downstroke_idx]
                spk["downstroke"] = dvdt[downstroke_idx]
                if np.isnan(spk["downstroke"]):
                    # sometimes dvdt will be NaN because of multiple cvode points at same time step
                    close_idx = downstroke_idx + 1
                    while np.isnan(dvdt[close_idx]):
                        close_idx += 1
                    spk["downstroke"] = dvdt[close_idx]

        features = {}
        # baseline voltage, 100ms before stim
        feature.mean["base_v"] = v[np.where((t > onset - 0.1) & (t < onset - 0.001))].mean()
        feature.mean["spikes"] = spikes
        isi_cv = self.isicv(spikes)
        if isi_cv is not None:
            feature.mean["ISICV"] = isi_cv
        n_spikes = len(spikes)
        feature.mean["n_spikes"] = n_spikes
        feature.mean["rate"] = 1.0 * n_spikes / (stop - start)
        feature.mean["adapt"] = self.adaptation_index(spikes, stop)
        if len(spikes) > 1:
            feature.mean["doublet"] = 1000 * (spikes[1]["t"] - spikes[0]["t"])

        if len(spikes) > 0:
            for i, spk in enumerate(spikes):
                idx_next = spikes[i + 1]["t_idx"] if i < len(spikes) - 1 else stop_idx
                self.calculate_trough(spk, v, curr, t, idx_next)
                half_max_v = (spk["f_peak"] - spk["f_trough"]) / 2.0 + spk["f_trough"]
                over_half_max_v_idxs = np.where(v[spk["t_idx"]:spk["trough_idx"]] > half_max_v)[0]
                if len(over_half_max_v_idxs) > 0:
                    spk["width"] = 1000. * (t[over_half_max_v_idxs[-1] + spk["t_idx"]] - t[over_half_max_v_idxs[0] + spk["t_idx"]])
            feature.mean["latency"] = 1000. * (spikes[0]["t"] - onset)
            feature.mean["latency_n30"] = 1000. * (spikes[0]["t_n30"] - onset)

        # extract properties for each spike
        isicnt = 0
        isitot = 0
        for i in range(0, len(spikes) - 1):
            spk = spikes[i]
            idx_next = spikes[i + 1]["t_idx"]
            isitot += spikes[i + 1]["t"] - spikes[i]["t"]
            isicnt += 1
        if isicnt > 0:
            feature.mean["isi_avg"] = 1000 * isitot / isicnt
        else:
            feature.mean["isi_avg"] = None

        # average feature data from individual spikes
        # build superset dictionary of possible features
        superset = {}
        for i in range(len(spikes)):
            for k in spikes[i].keys():
                if k not in superset:
                    superset[k] = k
        for k in superset.keys():
            cnt = 0
            mean = 0
            for i in range(len(spikes)):
                if k not in spikes[i]:
                    continue
                mean += float(spikes[i][k])
                cnt += 1.0
            # this shouldn't be possible, but it may be in future version
            # so might as well trap for it
            if cnt == 0:
                continue
            mean /= cnt
            stdev = 0
            for i in range(len(spikes)):
                if k not in spikes[i]:
                    continue
                dif = mean - float(spikes[i][k])
                stdev += dif * dif
            stdev = math.sqrt(stdev / cnt)
            feature.mean[k] = mean
            feature.stdev[k] = stdev
        #
        self.feature_list.append(feature)
        self.feature_source.append(name)
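    # Usage sketch (hypothetical data): a single sweep is pushed through the
    # extractor with the stimulus window given by onset/dur. The flat trace
    # below evokes no spikes, so it only illustrates the calling convention;
    # real inputs are membrane potential in mV, stimulus current, and time in
    # seconds on a regular sample grid.
    #
    #   t = np.arange(0.0, 2.0, 1e-4)           # 10 kHz time base (s)
    #   v = np.full_like(t, -70.0)              # subthreshold, no spikes (mV)
    #   curr = np.zeros_like(t)
    #   curr[(t >= 1.0) & (t < 1.5)] = 100.0    # step stimulus
    #   fx = EphysFeatureExtractor()
    #   fx.process_instance("sweep_0", v, curr, t, 1.0, 0.5, "step")
    #   fx.feature_list[0].mean["n_spikes"]     # 0 for this trace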
    def isicv(self, spikes):
        if len(spikes) < 3:
            return None
        isi_mean = 0
        lst = []
        for i in range(len(spikes) - 1):
            isi = spikes[i + 1]["t"] - spikes[i]["t"]
            #print("\t%g" % isi)
            isi_mean += isi
            lst.append(isi)
        isi_mean /= 1.0 * len(lst)
        #print(isi_mean)
        var = 0
        for i in range(len(lst)):
            dif = isi_mean - lst[i]
            var += dif * dif
        var /= len(lst)
        #var /= len(lst) - 1
        #print(math.sqrt(var))
        if isi_mean > 0:
            return math.sqrt(var) / isi_mean
        return None
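    # Worked example (hypothetical spike times): spikes at 0.10, 0.20, 0.35 and
    # 0.55 s give ISIs of 0.10, 0.15 and 0.20 s; the mean ISI is 0.15 s and the
    # population standard deviation is ~0.041 s, so the ISI-CV is ~0.27.
    #
    #   spikes = [{"t": s} for s in (0.10, 0.20, 0.35, 0.55)]
    #   EphysFeatureExtractor().isicv(spikes)   # ~0.272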
    def adaptation_index(self, spikes, stim_end):
        if len(spikes) < 4:
            return None
        adi = 0
        cnt = 0
        isi = []
        for i in range(len(spikes) - 1):
            isi.append(spikes[i + 1]["t"] - spikes[i]["t"])
        # act as though time between last spike and stim end is another ISI per Etay's code
        # l = stim_end - spikes[-1]["t"]
        # if l > 0 and l > isi[-1]:
        #     isi.append(l)
        for i in range(len(isi) - 1):
            adi += 1.0 * (isi[i + 1] - isi[i]) / (isi[i + 1] + isi[i])
            cnt += 1
        adi /= cnt
        return adi
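    # Worked example (same hypothetical spike times as above): ISIs of 0.10,
    # 0.15 and 0.20 s give pairwise terms (0.05 / 0.25) = 0.2 and
    # (0.05 / 0.35) ~= 0.14, so the adaptation index is ~0.17 (positive because
    # the ISIs lengthen). stim_end is currently unused since the trailing-ISI
    # block above is commented out.
    #
    #   spikes = [{"t": s} for s in (0.10, 0.20, 0.35, 0.55)]
    #   EphysFeatureExtractor().adaptation_index(spikes, 0.6)   # ~0.171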
    ##----------------------------------------------------------------------
    # trough (AHP) is presently defined as the minimum voltage level
    # observed between successive spikes in a burst
    # there's too much data to cleanly return it on the stack,
    # so the spike table is passed in and filled in place instead
    def calculate_trough(self, spike, v, curr, t, next_idx):
        # dt = t[1] - t[0]
        peak_idx = spike["peak_idx"]
        if peak_idx >= next_idx:
            logging.warning("next index (%d) before peak index (%d) calculating trough" % (next_idx, peak_idx))
            trough_idx = next_idx
        else:
            trough_idx = np.argmin(v[peak_idx:next_idx]) + peak_idx
        spike["trough_idx"] = trough_idx
        spike["f_trough"] = v[trough_idx]
        spike["trough_v"] = v[trough_idx]
        spike["trough_t"] = t[trough_idx]
        spike["trough_i"] = curr[trough_idx]

        # calculate etay's 'fast' and 'slow' ahp here
        if t[peak_idx] + 0.005 >= t[-1]:
            five_ms_idx = len(t) - 1
        else:
            five_ms_idx = np.where(t >= 0.005 + t[peak_idx])[0][0]  # 5ms after peak
        # fast AHP is minimum value occurring w/in 5ms
        if five_ms_idx >= next_idx:
            five_ms_idx = next_idx
        if peak_idx == five_ms_idx:
            fast_idx = next_idx
        else:
            fast_idx = np.argmin(v[peak_idx:five_ms_idx]) + peak_idx
        spike["f_fast_ahp"] = v[fast_idx]
        spike["f_fast_ahp_v"] = v[fast_idx]
        spike["f_fast_ahp_i"] = curr[fast_idx]
        spike["f_fast_ahp_t"] = t[fast_idx]
        if five_ms_idx == next_idx:
            slow_idx = fast_idx
        else:
            slow_idx = np.argmin(v[five_ms_idx:next_idx]) + five_ms_idx
        spike["f_slow_ahp"] = v[slow_idx]
        spike["f_slow_ahp_time"] = (t[slow_idx] - t[peak_idx]) / (t[next_idx] - t[peak_idx])
        spike["f_slow_ahp_t"] = t[slow_idx]
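    # Usage sketch (toy trace): calculate_trough() splits the AHP at 5 ms past
    # the spike peak -- the fast AHP is the minimum within that window and the
    # slow AHP is the minimum from there to the next spike, with its time
    # reported as a fraction of the interspike interval. The cosine "spike"
    # below is only meant to show which keys get filled in.
    #
    #   t = np.arange(0.0, 0.040, 1e-4)                  # 40 ms at 10 kHz
    #   v = -60.0 + 5.0 * np.cos(2 * np.pi * t / 0.040)  # peak at t=0, minimum mid-window
    #   curr = np.zeros_like(t)
    #   spike = {"peak_idx": 0}                          # normally set by process_instance()
    #   EphysFeatureExtractor().calculate_trough(spike, v, curr, t, len(t) - 1)
    #   spike["f_fast_ahp"], spike["f_slow_ahp"], spike["f_slow_ahp_time"]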
    # initialize summary feature set from file
    def push_summary(self, new_summary):
        self.summary = new_summary
    # calculate nearness score for feature set X relative to summary
    # each feature contributes its distance from the target value,
    # in units of standard deviations
    # when a feature is absent, the algorithm to determine the
    # penalty is stored in the feature itself, and this value
    # is calculated and added to the score list instead
    def score_feature_set(self, set_num):
        cand = self.feature_list[set_num]
        scores = []
        for k in sorted(self.summary.mean.keys()):
            if k in self.summary.glossary:
                response = self.summary.scoring[k]["hit"]
                if response == "ignore":
                    continue
                elif response == "stdev":
                    mean = self.summary.mean[k]
                    stdev = self.summary.stdev[k]
                    assert stdev > 0
                    if k in cand.mean and cand.mean[k] is not None:
                        val = cand.mean[k]
                        inc = abs(mean - val) / stdev
                        scores.append(inc)
                        # print("Hit %s, %g+/-%g (%g) = %g" % (k, mean, stdev, val, inc))
                    else:
                        resp = cand.scoring[k]["miss"]
                        if resp == "const":
                            miss = float(cand.scoring[k][resp])
                        elif resp == "mean_mult":
                            miss = mean * float(cand.scoring[k][resp])
                        else:
                            assert False
                        miss = float(miss)
                        scores.append(miss)
                        # print("Missed %s, penalty = %g" % (k, miss))
                elif response == "perspike":
                    mean = self.summary.mean[k]
                    stdev = self.summary.stdev[k]
                    assert stdev > 0
                    if k in cand.mean and cand.mean[k] is not None:
                        val = 0
                        n_spikes = len(cand.mean["spikes"])
                        skip_last_n = self.summary.scoring[k]["skip_last_n"]
                        for spike in cand.mean["spikes"][:n_spikes - skip_last_n]:
                            val += abs(spike[k] - mean)
                        val /= n_spikes - skip_last_n
                        inc = val / stdev
                        scores.append(inc)
                    else:
                        resp = cand.scoring[k]["miss"]
                        if resp == "const":
                            miss = float(cand.scoring[k][resp])
                        elif resp == "mean_mult":
                            miss = mean * float(cand.scoring[k][resp])
                        else:
                            assert False
                        miss = float(miss)
                        scores.append(miss)
                        # print("Missed %s, penalty = %g" % (k, miss))
                else:
                    assert False
                if abs(sum(scores)) > 1e10:
                    print(k)
                    print(self.summary.scoring)
                    print(self.summary.mean)
                    print(self.summary.stdev)
                    print(cand.scoring)
                    print(cand.mean)
                    print(cand.stdev)
                    assert False
        return scores
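    # Worked example (hypothetical numbers): if the summary lists a firing rate
    # of 12 +/- 2.5 Hz and the candidate sweep fired at 7 Hz, "rate" contributes
    # |12 - 7| / 2.5 = 2.0 to the returned score list. If the candidate had no
    # "rate" entry at all, the miss penalty from its scoring table is used
    # instead (a constant of 250 for "rate"; "mean_mult" features such as
    # "ISICV" use mean * 2). Per-spike features average the per-spike deviation
    # from the summary mean (optionally skipping the last skip_last_n spikes)
    # before dividing by the summary standard deviation.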
    # create summary of feature instances
    # 'summary' is an empty feature object. this must be the same
    # class as the other feature objects that are being summarized
    def summarize(self, summary):
        if len(self.feature_list) == 0:
            print("Error -- no features were extracted. Summary impossible")
            sys.exit()
        # make dummy dict to verify that all feature instances have
        # identical features
        # only copy features that are in the glossary. some are for
        # internal use (eg, t_idx -- time index) and aren't important
        # here
        superset = {}
        for i in range(len(self.feature_list)):
            fx = self.feature_list[i]
            for k in fx.mean.keys():
                if k in fx.glossary:
                    superset[k] = k
        err = 0
        for k in superset.keys():
            for i in range(len(self.feature_list)):
                fx = self.feature_list[i].mean
                if k not in fx:
                    print("Error - feature '%s' not in all data sets" % k)
                    err += 1
        if err > 0:
            return None
        # all features must be of the same type
        # to ensure this, make programmer specify the type being summarized
        self.summary = summary
        # now set summary means to zero
        for k in fx.keys():
            self.summary.mean[k] = 0
        # now calculate the average of all features
        for i in range(len(self.feature_list)):
            fx = self.feature_list[i]
            for k in fx.mean.keys():
                if k in fx.glossary and fx.mean[k] is not None:
                    self.summary.mean[k] += fx.mean[k]
                    self.summary.stdev[k] = 0.0
        # divide out n to get actual mean
        for k in fx.mean.keys():
            self.summary.mean[k] /= 1.0 * len(self.feature_list)
        # calculate standard deviation
        for i in range(len(self.feature_list)):
            fx = self.feature_list[i]
            for k in fx.mean.keys():
                if k in fx.glossary and fx.mean[k] is not None:
                    mean = self.summary.mean[k]
                    dif = mean - fx.mean[k]
                    self.summary.stdev[k] += dif * dif
        # divide out n and take sqrt to get actual stdev
        fx = self.feature_list[0]
        for k in fx.mean.keys():
            if k in fx.glossary and fx.mean[k] is not None:
                val = self.summary.stdev[k]
                val /= 1.0 * len(self.feature_list)
                self.summary.stdev[k] = math.sqrt(val)
        return self
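# End-to-end sketch of the summarize/score workflow (hypothetical values). The
# feature instances are filled in by hand so the example is self-contained; in
# normal use process_instance() populates them from recorded sweeps, and every
# sweep must share the same glossary features with nonzero spread for scoring.
#
#   fx = EphysFeatureExtractor()
#   for name, rate, latency in [("sweep_0", 10.0, 40.0), ("sweep_1", 14.0, 50.0)]:
#       f = EphysFeatures(name)
#       f.mean["rate"] = rate        # firing rate (Hz)
#       f.mean["latency"] = latency  # time to first spike (ms)
#       fx.feature_list.append(f)
#       fx.feature_source.append(name)
#
#   fx.summarize(EphysFeatures("summary"))  # builds fx.summary across sweeps
#   fx.summary.print_out()                  # rate = 12 +/- 2, latency = 45 +/- 5
#   fx.score_feature_set(0)                 # [1.0, 1.0] -- each feature is 1 stdev off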