# -*- coding: utf-8 -*-
"""
tools.stats -- Toolbox functions for computing statistics
Exported namespace: bootstrap, smooth_pdf, integer_hist
Written by Joe Monaco
Center for Theoretical Neuroscience
Copyright (c) 2007-2008 Columbia University. All Rights Reserved.
"""
import numpy as _N
from sys import stderr as _err

def bootstrap(X, N, H0, *args):
    """Get a signed one-tail p-value for X under an algorithmic sampling process

    H0(*args) must return a scalar sample from the null distribution. The sign
    of the returned p-value indicates whether X is less than (-) or greater
    than (+) the median of the null sample distribution.

    Arguments:
    X -- the value for which to return a p-value
    N -- number of null samples to draw (runtime is O(N) calls to H0)
    H0 -- function that implements the sampling process for the null result
    args -- additional arguments are passed through to H0
    """
    assert callable(H0), 'H0 must be a callable that returns a scalar sample'
    tail = 0
    for i in range(N):
        tail += int(H0(*args) >= X)
    if tail == 0 or tail == N:
        # X falls outside all N null samples, so only an upper bound on the
        # p-value magnitude can be reported
        _err.write('warning: bootstrap needs N > %d; returning upper bound\n'%N)
        tail = 1 if tail == 0 else -1  # keep the median sign convention
        return tail / float(N)
    if tail > N / 2.0:
        tail -= N  # negative p-value for X less than the median
    return tail / float(N)
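
# Usage sketch for bootstrap() -- illustrative only: the null sampler
# _null_mean and the values below are hypothetical, not part of this module.
def _example_bootstrap():
    import numpy as np
    def _null_mean(n):
        # null process: the statistic is the mean of n standard-normal draws
        return np.random.randn(n).mean()
    # signed one-tail p-value for an observed statistic of 0.3 under the null
    return bootstrap(0.3, 1000, _null_mean, 50)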

def smooth_pdf(a, sd=None):
    """Get a smoothed pdf of an array of data for visualization

    Keyword arguments:
    sd -- S.D. of the gaussian kernel used to perform the smoothing (default
        is 1/20 of the data range)

    Return 2-row (x, pdf(x)) smoothed probability density estimate.
    """
    from scipy.signal import convolve
    from scipy.signal.windows import gaussian
    from numpy import arange, cumsum, histogram, diff, r_, c_, zeros, full
    try:
        from numpy import trapezoid  # numpy >= 2.0
    except ImportError:
        from numpy import trapz as trapezoid
    data = a.flatten()  # flatten() returns a 1D copy of the array data
    span = data.max() - data.min()  # full data range
    if sd is None:
        sd = 0.05 * span
    nbins = max(len(data), 1000)  # num bins >~ O(len(data))
    f, edges = histogram(data, bins=nbins, density=True)  # fine-grained pdf
    l = edges[:-1]  # left bin edges serve as the x-axis index
    sd_bins = sd * (float(nbins) / span)  # convert sd to bin units
    kern_size = int(10 * sd_bins)  # sufficient convolution kernel size
    g = gaussian(kern_size, sd_bins)  # generate smoothing kernel
    c = cumsum(f, dtype='d')  # raw fine-grained cdf
    cext = r_[zeros(2 * kern_size, 'd'), c,
        full(2 * kern_size, c[-1])]  # extend the cdf to kill boundary effects
    cs = convolve(cext, g, mode='same')  # smooth the extended cdf
    ps = diff(cs)  # differentiate the smooth cdf to get the smooth pdf
    dl = l[1] - l[0]  # bin width
    l = r_[arange(l[0] - kern_size * dl, l[0], dl), l,
        arange(l[-1] + dl, l[-1] + kern_size * dl, dl)]  # pad index to match
    ps = ps[kern_size:kern_size + len(l)]  # crop pdf to same length as index
    ps /= trapezoid(ps, x=l)  # normalize pdf integral to unity
    return c_[l, ps].T  # return 2-row concatenation of x and pdf(x)
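
# Usage sketch for smooth_pdf() -- illustrative only: the sample data and the
# suggested plotting call are hypothetical, not part of this module.
def _example_smooth_pdf():
    import numpy as np
    samples = np.random.randn(5000)  # arbitrary data to density-estimate
    x, p = smooth_pdf(samples)  # unpack the 2-row (x, pdf(x)) result
    # plot with, e.g.: import matplotlib.pyplot as plt; plt.plot(x, p)
    return x, p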

def integer_hist(a, relative=False):
    """Get histogram data using integer bins

    Parameters:
    a -- the data to be histogrammed (ndim > 1 is flattened)
    relative -- whether count should be relative frequency or raw counts

    Returns (center, count):
    center -- integer bin centers for the histogram
    count -- bin frequencies, either relative frequency or raw count
    """
    data = a.round().flatten()
    # the bin range must come from the rounded data, not the raw values, so
    # that samples rounding past int(a.max()) are not dropped
    center = _N.arange(int(data.min()), int(data.max()) + 1)
    count = _N.empty(center.shape[0], 'd' if relative else 'l')
    for i, c in enumerate(center):
        count[i] = (data == c).sum()
    if relative:
        count /= count.sum()
    return center, count
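
# Usage sketch for integer_hist() -- illustrative only: the Poisson-distributed
# data below is an arbitrary choice, not part of this module.
def _example_integer_hist():
    import numpy as np
    samples = np.random.poisson(4.0, size=2000)  # integer-valued samples
    center, freq = integer_hist(samples, relative=True)
    # freq sums to 1.0 and center spans the observed integer range
    return center, freq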