"SfR Fresh" - the SfR Freeware/Shareware Archive

Member "spambayes-1.0.4/spambayes/chi2.py" of archive spambayes-1.0.4.zip:


As a special service "SfR Fresh" has tried to format the requested source page into HTML format using source code syntax highlighting with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file. That can be also achieved for any archive member file by clicking within an archive contents listing on the first character of the file(path) respectively on the according byte size field.
    1 import math as _math
    2 
    3 try:
    4     True, False
    5 except NameError:
    6     # Maintain compatibility with Python 2.2
    7     True, False = 1, 0
    8 
    9 
   10 def chi2Q(x2, v, exp=_math.exp, min=min):
   11     """Return prob(chisq >= x2, with v degrees of freedom).
   12 
   13     v must be even.
   14     """
   15     assert v & 1 == 0
   16     # XXX If x2 is very large, exp(-m) will underflow to 0.
   17     m = x2 / 2.0
   18     sum = term = exp(-m)
   19     for i in range(1, v//2):
   20         term *= m / i
   21         sum += term
   22     # With small x2 and large v, accumulated roundoff error, plus error in
   23     # the platform exp(), can cause this to spill a few ULP above 1.0.  For
   24     # example, chi2Q(100, 300) on my box has sum == 1.0 + 2.0**-52 at this
   25     # point.  Returning a value even a teensy bit over 1.0 is no good.
   26     return min(sum, 1.0)
   27 
   28 def normZ(z, sqrt2pi=_math.sqrt(2.0*_math.pi), exp=_math.exp):
   29     "Return value of the unit Gaussian at z."
   30     return exp(-z*z/2.0) / sqrt2pi
   31 
   32 def normP(z):
   33     """Return area under the unit Gaussian from -inf to z.
   34 
   35     This is the probability that a zscore is <= z.
   36     """
   37 
   38     # This is very accurate in a fixed-point sense.  For negative z of
   39     # large magnitude (<= -8.3), it returns 0.0, essentially because
   40     # P(-z) is, to machine precision, indistiguishable from 1.0 then.
   41 
   42     # sum <- area from 0 to abs(z).
   43     a = abs(float(z))
   44     if a >= 8.3:
   45         sum = 0.5
   46     else:
   47         sum2 = term = a * normZ(a)
   48         z2 = a*a
   49         sum = 0.0
   50         i = 1.0
   51         while sum != sum2:
   52             sum = sum2
   53             i += 2.0
   54             term *= z2 / i
   55             sum2 += term
   56 
   57     if z >= 0:
   58         result = 0.5 + sum
   59     else:
   60         result = 0.5 - sum
   61 
   62     return result
   63 
   64 def normIQ(p, sqrt=_math.sqrt, ln=_math.log):
   65     """Return z such that the area under the unit Gaussian from z to +inf is p.
   66 
   67     Must have 0.0 <= p <= 1.0.
   68     """
   69 
   70     assert 0.0 <= p <= 1.0
   71     # This is a low-accuracy rational approximation from Abramowitz & Stegun.
   72     # The absolute error is bounded by 3e-3.
   73 
   74     flipped = False
   75     if p > 0.5:
   76         flipped = True
   77         p = 1.0 - p
   78 
   79     if p == 0.0:
   80         z = 8.3
   81     else:
   82         t = sqrt(-2.0 * ln(p))
   83         z = t - (2.30753 + .27061*t) / (1. + .99229*t + .04481*t**2)
   84 
   85     if flipped:
   86         z = -z
   87     return z
   88 
   89 def normIP(p):
   90     """Return z such that the area under the unit Gaussian from -inf to z is p.
   91 
   92     Must have 0.0 <= p <= 1.0.
   93     """
   94     z = normIQ(1.0 - p)
   95     # One Newton step should double the # of good digits.
   96     return z + (p - normP(z)) / normZ(z)
   97 
   98 def main():
   99     from spambayes.Histogram import Hist
  100     import sys
  101 
  102     class WrappedRandom:
  103         # There's no way W-H is equidistributed in 50 dimensions, so use
  104         # Marsaglia-wrapping to shuffle it more.
  105 
  106         def __init__(self, baserandom=random.random, tabsize=513):
  107             self.baserandom = baserandom
  108             self.n = tabsize
  109             self.tab = [baserandom() for i in range(tabsize)]
  110             self.next = baserandom()
  111 
  112         def random(self):
  113             result = self.next
  114             i = int(result * self.n)
  115             self.next = self.tab[i]
  116             self.tab[i] = self.baserandom()
  117             return result
  118 
  119     random = WrappedRandom().random
  120     #from uni import uni as random
  121     #print random
  122 
  123     def judge(ps, ln=_math.log, ln2=_math.log(2), frexp=_math.frexp):
  124         H = S = 1.0
  125         Hexp = Sexp = 0
  126         for p in ps:
  127             S *= 1.0 - p
  128             H *= p
  129             if S < 1e-200:
  130                 S, e = frexp(S)
  131                 Sexp += e
  132             if H < 1e-200:
  133                 H, e = frexp(H)
  134                 Hexp += e
  135         S = ln(S) + Sexp * ln2
  136         H = ln(H) + Hexp * ln2
  137         n = len(ps)
  138         S = 1.0 - chi2Q(-2.0 * S, 2*n)
  139         H = 1.0 - chi2Q(-2.0 * H, 2*n)
  140         return S, H, (S-H + 1.0) / 2.0
  141 
  142     warp = 0
  143     bias = 0.99
  144     if len(sys.argv) > 1:
  145         warp = int(sys.argv[1])
  146     if len(sys.argv) > 2:
  147         bias = float(sys.argv[2])
  148 
  149     h = Hist(20, lo=0.0, hi=1.0)
  150     s = Hist(20, lo=0.0, hi=1.0)
  151     score = Hist(20, lo=0.0, hi=1.0)
  152 
  153     for i in range(5000):
  154         ps = [random() for j in range(50)]
  155         s1, h1, score1 = judge(ps + [bias] * warp)
  156         s.add(s1)
  157         h.add(h1)
  158         score.add(score1)
  159 
  160     print "Result for random vectors of 50 probs, +", warp, "forced to", bias
  161 
  162     # Should be uniformly distributed on all-random data.
  163     print
  164     print 'H',
  165     h.display()
  166 
  167     # Should be uniformly distributed on all-random data.
  168     print
  169     print 'S',
  170     s.display()
  171 
  172     # Distribution doesn't really matter.
  173     print
  174     print '(S-H+1)/2',
  175     score.display()
  176 
  177 def showscore(ps, ln=_math.log, ln2=_math.log(2), frexp=_math.frexp):
  178     H = S = 1.0
  179     Hexp = Sexp = 0
  180     for p in ps:
  181         S *= 1.0 - p
  182         H *= p
  183         if S < 1e-200:
  184             S, e = frexp(S)
  185             Sexp += e
  186         if H < 1e-200:
  187             H, e = frexp(H)
  188             Hexp += e
  189     S = ln(S) + Sexp * ln2
  190     H = ln(H) + Hexp * ln2
  191 
  192     n = len(ps)
  193     probS = chi2Q(-2*S, 2*n)
  194     probH = chi2Q(-2*H, 2*n)
  195     print "P(chisq >= %10g | v=%3d) = %10g" % (-2*S, 2*n, probS)
  196     print "P(chisq >= %10g | v=%3d) = %10g" % (-2*H, 2*n, probH)
  197 
  198     S = 1.0 - probS
  199     H = 1.0 - probH
  200     score = (S-H + 1.0) / 2.0
  201     print "spam prob", S
  202     print " ham prob", H
  203     print "(S-H+1)/2", score
  204 
  205 if __name__ == '__main__':
  206     import random
  207     main()