"SfR Fresh" - the SfR Freeware/Shareware Archive 
Member "spambayes-1.0.4/spambayes/chi2.py" of archive spambayes-1.0.4.zip:
As a special service "SfR Fresh" has tried to format the requested source page into HTML format using source code syntax highlighting with prefixed line numbers.
Alternatively you can here view or download the uninterpreted source code file.
That can be also achieved for any archive member file by clicking within an archive contents listing on the first character of the file(path) respectively on the according byte size field.
1 import math as _math
2
3 try:
4 True, False
5 except NameError:
6 # Maintain compatibility with Python 2.2
7 True, False = 1, 0
8
9
10 def chi2Q(x2, v, exp=_math.exp, min=min):
11 """Return prob(chisq >= x2, with v degrees of freedom).
12
13 v must be even.
14 """
15 assert v & 1 == 0
16 # XXX If x2 is very large, exp(-m) will underflow to 0.
17 m = x2 / 2.0
18 sum = term = exp(-m)
19 for i in range(1, v//2):
20 term *= m / i
21 sum += term
22 # With small x2 and large v, accumulated roundoff error, plus error in
23 # the platform exp(), can cause this to spill a few ULP above 1.0. For
24 # example, chi2Q(100, 300) on my box has sum == 1.0 + 2.0**-52 at this
25 # point. Returning a value even a teensy bit over 1.0 is no good.
26 return min(sum, 1.0)
27
28 def normZ(z, sqrt2pi=_math.sqrt(2.0*_math.pi), exp=_math.exp):
29 "Return value of the unit Gaussian at z."
30 return exp(-z*z/2.0) / sqrt2pi
31
32 def normP(z):
33 """Return area under the unit Gaussian from -inf to z.
34
35 This is the probability that a zscore is <= z.
36 """
37
38 # This is very accurate in a fixed-point sense. For negative z of
39 # large magnitude (<= -8.3), it returns 0.0, essentially because
40 # P(-z) is, to machine precision, indistiguishable from 1.0 then.
41
42 # sum <- area from 0 to abs(z).
43 a = abs(float(z))
44 if a >= 8.3:
45 sum = 0.5
46 else:
47 sum2 = term = a * normZ(a)
48 z2 = a*a
49 sum = 0.0
50 i = 1.0
51 while sum != sum2:
52 sum = sum2
53 i += 2.0
54 term *= z2 / i
55 sum2 += term
56
57 if z >= 0:
58 result = 0.5 + sum
59 else:
60 result = 0.5 - sum
61
62 return result
63
64 def normIQ(p, sqrt=_math.sqrt, ln=_math.log):
65 """Return z such that the area under the unit Gaussian from z to +inf is p.
66
67 Must have 0.0 <= p <= 1.0.
68 """
69
70 assert 0.0 <= p <= 1.0
71 # This is a low-accuracy rational approximation from Abramowitz & Stegun.
72 # The absolute error is bounded by 3e-3.
73
74 flipped = False
75 if p > 0.5:
76 flipped = True
77 p = 1.0 - p
78
79 if p == 0.0:
80 z = 8.3
81 else:
82 t = sqrt(-2.0 * ln(p))
83 z = t - (2.30753 + .27061*t) / (1. + .99229*t + .04481*t**2)
84
85 if flipped:
86 z = -z
87 return z
88
89 def normIP(p):
90 """Return z such that the area under the unit Gaussian from -inf to z is p.
91
92 Must have 0.0 <= p <= 1.0.
93 """
94 z = normIQ(1.0 - p)
95 # One Newton step should double the # of good digits.
96 return z + (p - normP(z)) / normZ(z)
97
98 def main():
99 from spambayes.Histogram import Hist
100 import sys
101
102 class WrappedRandom:
103 # There's no way W-H is equidistributed in 50 dimensions, so use
104 # Marsaglia-wrapping to shuffle it more.
105
106 def __init__(self, baserandom=random.random, tabsize=513):
107 self.baserandom = baserandom
108 self.n = tabsize
109 self.tab = [baserandom() for i in range(tabsize)]
110 self.next = baserandom()
111
112 def random(self):
113 result = self.next
114 i = int(result * self.n)
115 self.next = self.tab[i]
116 self.tab[i] = self.baserandom()
117 return result
118
119 random = WrappedRandom().random
120 #from uni import uni as random
121 #print random
122
123 def judge(ps, ln=_math.log, ln2=_math.log(2), frexp=_math.frexp):
124 H = S = 1.0
125 Hexp = Sexp = 0
126 for p in ps:
127 S *= 1.0 - p
128 H *= p
129 if S < 1e-200:
130 S, e = frexp(S)
131 Sexp += e
132 if H < 1e-200:
133 H, e = frexp(H)
134 Hexp += e
135 S = ln(S) + Sexp * ln2
136 H = ln(H) + Hexp * ln2
137 n = len(ps)
138 S = 1.0 - chi2Q(-2.0 * S, 2*n)
139 H = 1.0 - chi2Q(-2.0 * H, 2*n)
140 return S, H, (S-H + 1.0) / 2.0
141
142 warp = 0
143 bias = 0.99
144 if len(sys.argv) > 1:
145 warp = int(sys.argv[1])
146 if len(sys.argv) > 2:
147 bias = float(sys.argv[2])
148
149 h = Hist(20, lo=0.0, hi=1.0)
150 s = Hist(20, lo=0.0, hi=1.0)
151 score = Hist(20, lo=0.0, hi=1.0)
152
153 for i in range(5000):
154 ps = [random() for j in range(50)]
155 s1, h1, score1 = judge(ps + [bias] * warp)
156 s.add(s1)
157 h.add(h1)
158 score.add(score1)
159
160 print "Result for random vectors of 50 probs, +", warp, "forced to", bias
161
162 # Should be uniformly distributed on all-random data.
163 print
164 print 'H',
165 h.display()
166
167 # Should be uniformly distributed on all-random data.
168 print
169 print 'S',
170 s.display()
171
172 # Distribution doesn't really matter.
173 print
174 print '(S-H+1)/2',
175 score.display()
176
177 def showscore(ps, ln=_math.log, ln2=_math.log(2), frexp=_math.frexp):
178 H = S = 1.0
179 Hexp = Sexp = 0
180 for p in ps:
181 S *= 1.0 - p
182 H *= p
183 if S < 1e-200:
184 S, e = frexp(S)
185 Sexp += e
186 if H < 1e-200:
187 H, e = frexp(H)
188 Hexp += e
189 S = ln(S) + Sexp * ln2
190 H = ln(H) + Hexp * ln2
191
192 n = len(ps)
193 probS = chi2Q(-2*S, 2*n)
194 probH = chi2Q(-2*H, 2*n)
195 print "P(chisq >= %10g | v=%3d) = %10g" % (-2*S, 2*n, probS)
196 print "P(chisq >= %10g | v=%3d) = %10g" % (-2*H, 2*n, probH)
197
198 S = 1.0 - probS
199 H = 1.0 - probH
200 score = (S-H + 1.0) / 2.0
201 print "spam prob", S
202 print " ham prob", H
203 print "(S-H+1)/2", score
204
205 if __name__ == '__main__':
206 import random
207 main()