"SfR Fresh" - the SfR Freeware/Shareware Archive 
Member "spambayes-1.0.4/spambayes/classifier.py" of archive spambayes-1.0.4.zip:
As a special service "SfR Fresh" has tried to format the requested source page into HTML format using source code syntax highlighting with prefixed line numbers.
Alternatively you can here view or download the uninterpreted source code file.
That can be also achieved for any archive member file by clicking within an archive contents listing on the first character of the file(path) respectively on the according byte size field.
1 #! /usr/bin/env python
2
3 from __future__ import generators
4
5 # An implementation of a Bayes-like spam classifier.
6 #
7 # Paul Graham's original description:
8 #
9 # http://www.paulgraham.com/spam.html
10 #
11 # A highly fiddled version of that can be retrieved from our CVS repository,
12 # via tag Last-Graham. This made many demonstrated improvements in error
13 # rates over Paul's original description.
14 #
15 # This code implements Gary Robinson's suggestions, the core of which are
16 # well explained on his webpage:
17 #
18 # http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
19 #
20 # This is theoretically cleaner, and in testing has performed at least as
21 # well as our highly tuned Graham scheme did, often slightly better, and
22 # sometimes much better. It also has "a middle ground", which people like:
23 # the scores under Paul's scheme were almost always very near 0 or very near
24 # 1, whether or not the classification was correct. The false positives
25 # and false negatives under Gary's basic scheme (use_gary_combining) generally
26 # score in a narrow range around the corpus's best spam_cutoff value.
27 # However, it doesn't appear possible to guess the best spam_cutoff value in
28 # advance, and it's touchy.
29 #
30 # The last version of the Gary-combining scheme can be retrieved from our
31 # CVS repository via tag Last-Gary.
32 #
33 # The chi-combining scheme used by default here gets closer to the theoretical
34 # basis of Gary's combining scheme, and does give extreme scores, but also
35 # has a very useful middle ground (small # of msgs spread across a large range
36 # of scores, and good cutoff values aren't touchy).
37 #
38 # This implementation is due to Tim Peters et alia.
39
40 import math
41 import types
42 try:
43 from sets import Set
44 except ImportError:
45 from spambayes.compatsets import Set
46
47 # XXX At time of writing, these are only necessary for the
48 # XXX experimental url retrieving/slurping code. If that
49 # XXX gets ripped out, either rip these out, or run
50 # XXX PyChecker over the code.
51 import re
52 import os
53 import sys
54 import socket
55 import pickle
56 import urllib2
57 from email import message_from_string
58
59 try:
60 enumerate
61 except NameError:
62 def enumerate(seq):
63 i = 0
64 for elt in seq:
65 yield (i, elt)
66 i += 1
67
68 DOMAIN_AND_PORT_RE = re.compile(r"([^:/\\]+)(:([\d]+))?")
69 HTTP_ERROR_RE = re.compile(r"HTTP Error ([\d]+)")
70 URL_KEY_RE = re.compile(r"[\W]")
71 # XXX ---- ends ----
72
73 from spambayes.Options import options
74 from spambayes.chi2 import chi2Q
75
76 try:
77 True, False
78 except NameError:
79 # Maintain compatibility with Python 2.2
80 True, False = 1, 0
81
82
83 LN2 = math.log(2) # used frequently by chi-combining
84
85 slurp_wordstream = None
86
87 PICKLE_VERSION = 5
88
89 class WordInfo(object):
90 # A WordInfo is created for each distinct word. spamcount is the
91 # number of trained spam msgs in which the word appears, and hamcount
92 # the number of trained ham msgs.
93 #
94 # Invariant: For use in a classifier database, at least one of
95 # spamcount and hamcount must be non-zero.
96 #
97 # Important: This is a tiny object. Use of __slots__ is essential
98 # to conserve memory.
99 __slots__ = 'spamcount', 'hamcount'
100
101 def __init__(self):
102 self.__setstate__((0, 0))
103
104 def __repr__(self):
105 return "WordInfo" + repr((self.spamcount, self.hamcount))
106
107 def __getstate__(self):
108 return self.spamcount, self.hamcount
109
110 def __setstate__(self, t):
111 self.spamcount, self.hamcount = t
112
113
114 class Classifier:
115 # Defining __slots__ here made Jeremy's life needlessly difficult when
116 # trying to hook this all up to ZODB as a persistent object. There's
117 # no space benefit worth getting from slots in this class; slots were
118 # used solely to help catch errors earlier, when this code was changing
119 # rapidly.
120
121 #__slots__ = ('wordinfo', # map word to WordInfo record
122 # 'nspam', # number of spam messages learn() has seen
123 # 'nham', # number of non-spam messages learn() has seen
124 # )
125
126 # allow a subclass to use a different class for WordInfo
127 WordInfoClass = WordInfo
128
129 def __init__(self):
130 self.wordinfo = {}
131 self.probcache = {}
132 self.nspam = self.nham = 0
133
134 def __getstate__(self):
135 return (PICKLE_VERSION, self.wordinfo, self.nspam, self.nham)
136
137 def __setstate__(self, t):
138 if t[0] != PICKLE_VERSION:
139 raise ValueError("Can't unpickle -- version %s unknown" % t[0])
140 (self.wordinfo, self.nspam, self.nham) = t[1:]
141 self.probcache = {}
142
143 # spamprob() implementations. One of the following is aliased to
144 # spamprob, depending on option settings.
145 # Currently only chi-squared is available, but maybe there will be
146 # an alternative again someday.
147
148 # Across vectors of length n, containing random uniformly-distributed
149 # probabilities, -2*sum(ln(p_i)) follows the chi-squared distribution
150 # with 2*n degrees of freedom. This has been proven (in some
151 # appropriate sense) to be the most sensitive possible test for
152 # rejecting the hypothesis that a vector of probabilities is uniformly
153 # distributed. Gary Robinson's original scheme was monotonic *with*
154 # this test, but skipped the details. Turns out that getting closer
155 # to the theoretical roots gives a much sharper classification, with
156 # a very small (in # of msgs), but also very broad (in range of scores),
157 # "middle ground", where most of the mistakes live. In particular,
158 # this scheme seems immune to all forms of "cancellation disease": if
159 # there are many strong ham *and* spam clues, this reliably scores
160 # close to 0.5. Most other schemes are extremely certain then -- and
161 # often wrong.
162 def chi2_spamprob(self, wordstream, evidence=False):
163 """Return best-guess probability that wordstream is spam.
164
165 wordstream is an iterable object producing words.
166 The return value is a float in [0.0, 1.0].
167
168 If optional arg evidence is True, the return value is a pair
169 probability, evidence
170 where evidence is a list of (word, probability) pairs.
171 """
172
173 from math import frexp, log as ln
174
175 # We compute two chi-squared statistics, one for ham and one for
176 # spam. The sum-of-the-logs business is more sensitive to probs
177 # near 0 than to probs near 1, so the spam measure uses 1-p (so
178 # that high-spamprob words have greatest effect), and the ham
179 # measure uses p directly (so that lo-spamprob words have greatest
180 # effect).
181 #
182 # For optimization, sum-of-logs == log-of-product, and f.p.
183 # multiplication is a lot cheaper than calling ln(). It's easy
184 # to underflow to 0.0, though, so we simulate unbounded dynamic
185 # range via frexp. The real product H = this H * 2**Hexp, and
186 # likewise the real product S = this S * 2**Sexp.
187 H = S = 1.0
188 Hexp = Sexp = 0
189
190 clues = self._getclues(wordstream)
191 for prob, word, record in clues:
192 S *= 1.0 - prob
193 H *= prob
194 if S < 1e-200: # prevent underflow
195 S, e = frexp(S)
196 Sexp += e
197 if H < 1e-200: # prevent underflow
198 H, e = frexp(H)
199 Hexp += e
200
201 # Compute the natural log of the product = sum of the logs:
202 # ln(x * 2**i) = ln(x) + i * ln(2).
203 S = ln(S) + Sexp * LN2
204 H = ln(H) + Hexp * LN2
205
206 n = len(clues)
207 if n:
208 S = 1.0 - chi2Q(-2.0 * S, 2*n)
209 H = 1.0 - chi2Q(-2.0 * H, 2*n)
210
211 # How to combine these into a single spam score? We originally
212 # used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H). A
213 # systematic problem is that we could end up being near-certain
214 # a thing was (for example) spam, even if S was small, provided
215 # that H was much smaller.
216 # Rob Hooft stared at these problems and invented the measure
217 # we use now, the simpler S-H, scaled into [0., 1.].
218 prob = (S-H + 1.0) / 2.0
219 else:
220 prob = 0.5
221
222 if evidence:
223 clues = [(w, p) for p, w, r in clues]
224 clues.sort(lambda a, b: cmp(a[1], b[1]))
225 clues.insert(0, ('*S*', S))
226 clues.insert(0, ('*H*', H))
227 return prob, clues
228 else:
229 return prob
230
231 def slurping_spamprob(self, wordstream, evidence=False):
232 """Do the standard chi-squared spamprob, but if the evidence
233 leaves the score in the unsure range, and we have fewer tokens
234 than max_discriminators, also generate tokens from the text
235 obtained by following http URLs in the message."""
236 h_cut = options["Categorization", "ham_cutoff"]
237 s_cut = options["Categorization", "spam_cutoff"]
238
239 # Get the raw score.
240 prob, clues = self.chi2_spamprob(wordstream, True)
241
242 # If necessary, enhance it with the tokens from whatever is
243 # at the URL's destination.
244 if len(clues) < options["Classifier", "max_discriminators"] and \
245 prob > h_cut and prob < s_cut and slurp_wordstream:
246 slurp_tokens = list(self._generate_slurp())
247 slurp_tokens.extend([w for (w,p) in clues])
248 sprob, sclues = self.chi2_spamprob(slurp_tokens, True)
249 if sprob < h_cut or sprob > s_cut:
250 prob = sprob
251 clues = sclues
252 if evidence:
253 return prob, clues
254 return prob
255
256 if options["Classifier", "use_chi_squared_combining"]:
257 if options["URLRetriever", "x-slurp_urls"]:
258 spamprob = slurping_spamprob
259 else:
260 spamprob = chi2_spamprob
261
262 def learn(self, wordstream, is_spam):
263 """Teach the classifier by example.
264
265 wordstream is a word stream representing a message. If is_spam is
266 True, you're telling the classifier this message is definitely spam,
267 else that it's definitely not spam.
268 """
269 if options["Classifier", "x-use_bigrams"]:
270 wordstream = self._enhance_wordstream(wordstream)
271 if options["URLRetriever", "x-slurp_urls"]:
272 wordstream = self._add_slurped(wordstream)
273 self._add_msg(wordstream, is_spam)
274
275 def unlearn(self, wordstream, is_spam):
276 """In case of pilot error, call unlearn ASAP after screwing up.
277
278 Pass the same arguments you passed to learn().
279 """
280 if options["Classifier", "x-use_bigrams"]:
281 wordstream = self._enhance_wordstream(wordstream)
282 if options["URLRetriever", "x-slurp_urls"]:
283 wordstream = self._add_slurped(wordstream)
284 self._remove_msg(wordstream, is_spam)
285
286 def probability(self, record):
287 """Compute, store, and return prob(msg is spam | msg contains word).
288
289 This is the Graham calculation, but stripped of biases, and
290 stripped of clamping into 0.01 thru 0.99. The Bayesian
291 adjustment following keeps them in a sane range, and one
292 that naturally grows the more evidence there is to back up
293 a probability.
294 """
295
296 spamcount = record.spamcount
297 hamcount = record.hamcount
298
299 # Try the cache first
300 try:
301 return self.probcache[spamcount][hamcount]
302 except KeyError:
303 pass
304
305 nham = float(self.nham or 1)
306 nspam = float(self.nspam or 1)
307
308 assert hamcount <= nham
309 hamratio = hamcount / nham
310
311 assert spamcount <= nspam
312 spamratio = spamcount / nspam
313
314 prob = spamratio / (hamratio + spamratio)
315
316 S = options["Classifier", "unknown_word_strength"]
317 StimesX = S * options["Classifier", "unknown_word_prob"]
318
319
320 # Now do Robinson's Bayesian adjustment.
321 #
322 # s*x + n*p(w)
323 # f(w) = --------------
324 # s + n
325 #
326 # I find this easier to reason about like so (equivalent when
327 # s != 0):
328 #
329 # x - p
330 # p + -------
331 # 1 + n/s
332 #
333 # IOW, it moves p a fraction of the distance from p to x, and
334 # less so the larger n is, or the smaller s is.
335
336 n = hamcount + spamcount
337 prob = (StimesX + n * prob) / (S + n)
338
339 # Update the cache
340 try:
341 self.probcache[spamcount][hamcount] = prob
342 except KeyError:
343 self.probcache[spamcount] = {hamcount: prob}
344
345 return prob
346
347 # NOTE: Graham's scheme had a strange asymmetry: when a word appeared
348 # n>1 times in a single message, training added n to the word's hamcount
349 # or spamcount, but predicting scored words only once. Tests showed
350 # that adding only 1 in training, or scoring more than once when
351 # predicting, hurt under the Graham scheme.
352 # This isn't so under Robinson's scheme, though: results improve
353 # if training also counts a word only once. The mean ham score decreases
354 # significantly and consistently, ham score variance decreases likewise,
355 # mean spam score decreases (but less than mean ham score, so the spread
356 # increases), and spam score variance increases.
357 # I (Tim) speculate that adding n times under the Graham scheme helped
358 # because it acted against the various ham biases, giving frequently
359 # repeated spam words (like "Viagra") a quick ramp-up in spamprob; else,
360 # adding only once in training, a word like that was simply ignored until
361 # it appeared in 5 distinct training spams. Without the ham-favoring
362 # biases, though, and never ignoring words, counting n times introduces
363 # a subtle and unhelpful bias.
364 # There does appear to be some useful info in how many times a word
365 # appears in a msg, but distorting spamprob doesn't appear a correct way
366 # to exploit it.
367 def _add_msg(self, wordstream, is_spam):
368 self.probcache = {} # nuke the prob cache
369 if is_spam:
370 self.nspam += 1
371 else:
372 self.nham += 1
373
374 for word in Set(wordstream):
375 record = self._wordinfoget(word)
376 if record is None:
377 record = self.WordInfoClass()
378
379 if is_spam:
380 record.spamcount += 1
381 else:
382 record.hamcount += 1
383
384 self._wordinfoset(word, record)
385
386 self._post_training()
387
388 def _remove_msg(self, wordstream, is_spam):
389 self.probcache = {} # nuke the prob cache
390 if is_spam:
391 if self.nspam <= 0:
392 raise ValueError("spam count would go negative!")
393 self.nspam -= 1
394 else:
395 if self.nham <= 0:
396 raise ValueError("non-spam count would go negative!")
397 self.nham -= 1
398
399 for word in Set(wordstream):
400 record = self._wordinfoget(word)
401 if record is not None:
402 if is_spam:
403 if record.spamcount > 0:
404 record.spamcount -= 1
405 else:
406 if record.hamcount > 0:
407 record.hamcount -= 1
408 if record.hamcount == 0 == record.spamcount:
409 self._wordinfodel(word)
410 else:
411 self._wordinfoset(word, record)
412
413 self._post_training()
414
415 def _post_training(self):
416 """This is called after training on a wordstream. Subclasses might
417 want to ensure that their databases are in a consistent state at
418 this point. Introduced to fix bug #797890."""
419 pass
420
421 # Return list of (prob, word, record) triples, sorted by increasing
422 # prob. "word" is a token from wordstream; "prob" is its spamprob (a
423 # float in 0.0 through 1.0); and "record" is word's associated
424 # WordInfo record if word is in the training database, or None if it's
425 # not. No more than max_discriminators items are returned, and have
426 # the strongest (farthest from 0.5) spamprobs of all tokens in wordstream.
427 # Tokens with spamprobs less than minimum_prob_strength away from 0.5
428 # aren't returned.
429 def _getclues(self, wordstream):
430 mindist = options["Classifier", "minimum_prob_strength"]
431
432 if options["Classifier", "x-use_bigrams"]:
433 # This scheme mixes single tokens with pairs of adjacent tokens.
434 # wordstream is "tiled" into non-overlapping unigrams and
435 # bigrams. Non-overlap is important to prevent a single original
436 # token from contributing to more than one spamprob returned
437 # (systematic correlation probably isn't a good thing).
438
439 # First fill list raw with
440 # (distance, prob, word, record), indices
441 # pairs, one for each unigram and bigram in wordstream.
442 # indices is a tuple containing the indices (0-based relative to
443 # the start of wordstream) of the tokens that went into word.
444 # indices is a 1-tuple for an original token, and a 2-tuple for
445 # a synthesized bigram token. The indices are needed to detect
446 # overlap later.
447 raw = []
448 push = raw.append
449 pair = None
450 # Keep track of which tokens we've already seen.
451 # Don't use a Set here! This is an innermost loop, so speed is
452 # important here (direct dict fiddling is much quicker than
453 # invoking Python-level Set methods; in Python 2.4 that will
454 # change).
455 seen = {pair: 1} # so the bigram token is skipped on 1st loop trip
456 for i, token in enumerate(wordstream):
457 if i: # not the 1st loop trip, so there is a preceding token
458 # This string interpolation must match the one in
459 # _enhance_wordstream().
460 pair = "bi:%s %s" % (last_token, token)
461 last_token = token
462 for clue, indices in (token, (i,)), (pair, (i-1, i)):
463 if clue not in seen: # as always, skip duplicates
464 seen[clue] = 1
465 tup = self._worddistanceget(clue)
466 if tup[0] >= mindist:
467 push((tup, indices))
468
469 # Sort raw, strongest to weakest spamprob.
470 raw.sort()
471 raw.reverse()
472 # Fill clues with the strongest non-overlapping clues.
473 clues = []
474 push = clues.append
475 # Keep track of which indices have already contributed to a
476 # clue in clues.
477 seen = {}
478 for tup, indices in raw:
479 overlap = [i for i in indices if i in seen]
480 if not overlap: # no overlap with anything already in clues
481 for i in indices:
482 seen[i] = 1
483 push(tup)
484 # Leave sorted from smallest to largest spamprob.
485 clues.reverse()
486
487 else:
488 # The all-unigram scheme just scores the tokens as-is. A Set()
489 # is used to weed out duplicates at high speed.
490 clues = []
491 push = clues.append
492 for word in Set(wordstream):
493 tup = self._worddistanceget(word)
494 if tup[0] >= mindist:
495 push(tup)
496 clues.sort()
497
498 if len(clues) > options["Classifier", "max_discriminators"]:
499 del clues[0 : -options["Classifier", "max_discriminators"]]
500 # Return (prob, word, record).
501 return [t[1:] for t in clues]
502
503 def _worddistanceget(self, word):
504 record = self._wordinfoget(word)
505 if record is None:
506 prob = options["Classifier", "unknown_word_prob"]
507 else:
508 prob = self.probability(record)
509 distance = abs(prob - 0.5)
510 return distance, prob, word, record
511
512 def _wordinfoget(self, word):
513 return self.wordinfo.get(word)
514
515 def _wordinfoset(self, word, record):
516 self.wordinfo[word] = record
517
518 def _wordinfodel(self, word):
519 del self.wordinfo[word]
520
521 def _enhance_wordstream(self, wordstream):
522 """Add bigrams to the wordstream.
523
524 For example, a b c -> a b "a b" c "b c"
525
526 Note that these are *token* bigrams, and not *word* bigrams - i.e.
527 'synthetic' tokens get bigram'ed, too.
528
529 The bigram token is simply "bi:unigram1 unigram2" - a space should
530 be sufficient as a separator, since spaces aren't in any other
531 tokens, apart from 'synthetic' ones. The "bi:" prefix is added
532 to avoid conflict with tokens we generate (like "subject: word",
533 which could be "word" in a subject, or a bigram of "subject:" and
534 "word").
535
536 If the experimental "Classifier":"x-use_bigrams" option is
537 removed, this function can be removed, too.
538 """
539
540 last = None
541 for token in wordstream:
542 yield token
543 if last:
544 # This string interpolation must match the one in
545 # _getclues().
546 yield "bi:%s %s" % (last, token)
547 last = token
548
549 def _generate_slurp(self):
550 # We don't want to do this recursively and check URLs
551 # on webpages, so we have this little cheat.
552 if not hasattr(self, "setup_done"):
553 self.setup()
554 self.setup_done = True
555 if not hasattr(self, "do_slurp") or self.do_slurp:
556 if slurp_wordstream:
557 self.do_slurp = False
558
559 tokens = self.slurp(*slurp_wordstream)
560 self.do_slurp = True
561 self._save_caches()
562 return tokens
563 return []
564
565 def setup(self):
566 # Can't import this at the top because it's circular.
567 # XXX Someone smarter than me, please figure out the right
568 # XXX way to do this.
569 from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory
570
571 username = options["globals", "proxy_username"]
572 password = options["globals", "proxy_password"]
573 server = options["globals", "proxy_server"]
574 if server.find(":") != -1:
575 server, port = server.split(':', 1)
576 else:
577 port = 8080
578 if server:
579 # Build a new opener that uses a proxy requiring authorization
580 proxy_support = urllib2.ProxyHandler({"http" : \
581 "http://%s:%s@%s:%d" % \
582 (username, password,
583 server, port)})
584 opener = urllib2.build_opener(proxy_support,
585 urllib2.HTTPHandler)
586 else:
587 # Build a new opener without any proxy information.
588 opener = urllib2.build_opener(urllib2.HTTPHandler)
589
590 # Install it
591 urllib2.install_opener(opener)
592
593 # Setup the cache for retrieved urls
594 age = options["URLRetriever", "x-cache_expiry_days"]*24*60*60
595 dir = options["URLRetriever", "x-cache_directory"]
596 if not os.path.exists(dir):
597 # Create the directory.
598 if options["globals", "verbose"]:
599 print >>sys.stderr, "Creating URL cache directory"
600 os.makedirs(dir)
601
602 self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(),
603 dir, cacheSize=20)
604 # Kill any old information in the cache
605 self.urlCorpus.removeExpiredMessages()
606
607 # Setup caches for unretrievable urls
608 self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck")
609 self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck")
610 if os.path.exists(self.bad_url_cache_name):
611 b_file = file(self.bad_url_cache_name, "r")
612 try:
613 self.bad_urls = pickle.load(b_file)
614 except IOError, ValueError:
615 # Something went wrong loading it (bad pickle,
616 # probably). Start afresh.
617 if options["globals", "verbose"]:
618 print >>sys.stderr, "Bad URL pickle, using new."
619 self.bad_urls = {"url:non_resolving": (),
620 "url:non_html": (),
621 "url:unknown_error": ()}
622 b_file.close()
623 else:
624 if options["globals", "verbose"]:
625 print "URL caches don't exist: creating"
626 self.bad_urls = {"url:non_resolving": (),
627 "url:non_html": (),
628 "url:unknown_error": ()}
629 if os.path.exists(self.http_error_cache_name):
630 h_file = file(self.http_error_cache_name, "r")
631 try:
632 self.http_error_urls = pickle.load(h_file)
633 except IOError, ValueError:
634 # Something went wrong loading it (bad pickle,
635 # probably). Start afresh.
636 if options["globals", "verbose"]:
637 print >>sys.stderr, "Bad HHTP error pickle, using new."
638 self.http_error_urls = {}
639 h_file.close()
640 else:
641 self.http_error_urls = {}
642
643 def _save_caches(self):
644 # XXX Note that these caches are never refreshed, which might not
645 # XXX be a good thing long-term (if a previously invalid URL
646 # XXX becomes valid, for example).
647 for name, data in [(self.bad_url_cache_name, self.bad_urls),
648 (self.http_error_cache_name, self.http_error_urls),]:
649 # Save to a temp file first, in case something goes wrong.
650 cache = open(name + ".tmp", "w")
651 pickle.dump(data, cache)
652 cache.close()
653 try:
654 os.rename(name + ".tmp", name)
655 except OSError:
656 # Atomic replace isn't possible with win32, so just
657 # remove and rename.
658 os.remove(name)
659 os.rename(name + ".tmp", name)
660
661 def slurp(self, proto, url):
662 # We generate these tokens:
663 # url:non_resolving
664 # url:non_html
665 # url:http_XXX (for each type of http error encounted,
666 # for example 404, 403, ...)
667 # And tokenise the received page (but we do not slurp this).
668 # Actually, the special url: tokens barely showed up in my testing,
669 # although I would have thought that they would more - this might
670 # be due to an error, although they do turn up on occasion. In
671 # any case, we have to do the test, so generating an extra token
672 # doesn't cost us anything apart from another entry in the db, and
673 # it's only two entries, plus one for each type of http error
674 # encountered, so it's pretty neglible.
675 from spambayes.tokenizer import Tokenizer
676
677 if options["URLRetriever", "x-only_slurp_base"]:
678 url = self._base_url(url)
679
680 # Check the unretrievable caches
681 for err in self.bad_urls.keys():
682 if url in self.bad_urls[err]:
683 return [err]
684 if self.http_error_urls.has_key(url):
685 return self.http_error_urls[url]
686
687 # We check if the url will resolve first
688 mo = DOMAIN_AND_PORT_RE.match(url)
689 domain = mo.group(1)
690 if mo.group(3) is None:
691 port = 80
692 else:
693 port = mo.group(3)
694 try:
695 not_used = socket.getaddrinfo(domain, port)
696 except socket.error:
697 self.bad_urls["url:non_resolving"] += (url,)
698 return ["url:non_resolving"]
699
700 # If the message is in our cache, then we can just skip over
701 # retrieving it from the network, and get it from there, instead.
702 url_key = URL_KEY_RE.sub('_', url)
703 cached_message = self.urlCorpus.get(url_key)
704
705 if cached_message is None:
706 # We're going to ignore everything that isn't text/html,
707 # so we might as well not bother retrieving anything with
708 # these extensions.
709 parts = url.split('.')
710 if parts[-1] in ('jpg', 'gif', 'png', 'css', 'js'):
711 self.bad_urls["url:non_html"] += (url,)
712 return ["url:non_html"]
713
714 try:
715 if options["globals", "verbose"]:
716 print >>sys.stderr, "Slurping", url
717 f = urllib2.urlopen("%s://%s" % (proto, url))
718 except (urllib2.URLError, socket.error), details:
719 mo = HTTP_ERROR_RE.match(str(details))
720 if mo:
721 self.http_error_urls[url] = "url:http_" + mo.group(1)
722 return ["url:http_" + mo.group(1)]
723 self.bad_urls["url:unknown_error"] += (url,)
724 return ["url:unknown_error"]
725
726 try:
727 # Anything that isn't text/html is ignored
728 content_type = f.info().get('content-type')
729 if content_type is None or \
730 not content_type.startswith("text/html"):
731 self.bad_urls["url:non_html"] += (url,)
732 return ["url:non_html"]
733
734 page = f.read()
735 headers = str(f.info())
736 f.close()
737 except socket.error:
738 # This is probably a temporary error, like a timeout.
739 # For now, just bail out.
740 return []
741
742 fake_message_string = headers + "\r\n" + page
743
744 # Retrieving the same messages over and over again will tire
745 # us out, so we store them in our own wee cache.
746 message = self.urlCorpus.makeMessage(url_key)
747 message.setPayload(fake_message_string)
748 self.urlCorpus.addMessage(message)
749 else:
750 fake_message_string = cached_message.as_string()
751
752 msg = message_from_string(fake_message_string)
753
754 # We don't want to do full header tokenising, as this is
755 # optimised for messages, not webpages, so we just do the
756 # basic stuff.
757 bht = options["Tokenizer", "basic_header_tokenize"]
758 bhto = options["Tokenizer", "basic_header_tokenize_only"]
759 options["Tokenizer", "basic_header_tokenize"] = True
760 options["Tokenizer", "basic_header_tokenize_only"] = True
761
762 tokens = Tokenizer().tokenize(msg)
763 pf = options["URLRetriever", "x-web_prefix"]
764 tokens = ["%s%s" % (pf, tok) for tok in tokens]
765
766 # Undo the changes
767 options["Tokenizer", "basic_header_tokenize"] = bht
768 options["Tokenizer", "basic_header_tokenize_only"] = bhto
769 return tokens
770
771 def _base_url(self, url):
772 # To try and speed things up, and to avoid following
773 # unique URLS, we convert the URL to as basic a form
774 # as we can - so http://www.massey.ac.nz/~tameyer/index.html?you=me
775 # would become http://massey.ac.nz and http://id.example.com
776 # would become http://example.com
777 url += '/'
778 domain, garbage = url.split('/', 1)
779 parts = domain.split('.')
780 if len(parts) > 2:
781 base_domain = parts[-2] + '.' + parts[-1]
782 if len(parts[-1]) < 3:
783 base_domain = parts[-3] + '.' + base_domain
784 else:
785 base_domain = domain
786 return base_domain
787
788 def _add_slurped(self, wordstream):
789 """Add tokens generated by 'slurping' (i.e. tokenizing
790 the text at the web pages pointed to by URLs in messages)
791 to the wordstream."""
792 for token in wordstream:
793 yield token
794 slurped_tokens = self._generate_slurp()
795 for token in slurped_tokens:
796 yield token
797
798 def _wordinfokeys(self):
799 return self.wordinfo.keys()
800
801
802 Bayes = Classifier