"SfR Fresh" - the SfR Freeware/Shareware Archive

Member "spambayes-1.0.4/spambayes/classifier.py" of archive spambayes-1.0.4.zip:


As a special service "SfR Fresh" has tried to format the requested source page into HTML format using source code syntax highlighting with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file. That can be also achieved for any archive member file by clicking within an archive contents listing on the first character of the file(path) respectively on the according byte size field.
    1 #! /usr/bin/env python
    2 
    3 from __future__ import generators
    4 
    5 # An implementation of a Bayes-like spam classifier.
    6 #
    7 # Paul Graham's original description:
    8 #
    9 #     http://www.paulgraham.com/spam.html
   10 #
   11 # A highly fiddled version of that can be retrieved from our CVS repository,
   12 # via tag Last-Graham.  This made many demonstrated improvements in error
   13 # rates over Paul's original description.
   14 #
   15 # This code implements Gary Robinson's suggestions, the core of which are
   16 # well explained on his webpage:
   17 #
   18 #    http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
   19 #
   20 # This is theoretically cleaner, and in testing has performed at least as
   21 # well as our highly tuned Graham scheme did, often slightly better, and
   22 # sometimes much better.  It also has "a middle ground", which people like:
   23 # the scores under Paul's scheme were almost always very near 0 or very near
   24 # 1, whether or not the classification was correct.  The false positives
   25 # and false negatives under Gary's basic scheme (use_gary_combining) generally
   26 # score in a narrow range around the corpus's best spam_cutoff value.
   27 # However, it doesn't appear possible to guess the best spam_cutoff value in
   28 # advance, and it's touchy.
   29 #
   30 # The last version of the Gary-combining scheme can be retrieved from our
   31 # CVS repository via tag Last-Gary.
   32 #
   33 # The chi-combining scheme used by default here gets closer to the theoretical
   34 # basis of Gary's combining scheme, and does give extreme scores, but also
   35 # has a very useful middle ground (small # of msgs spread across a large range
   36 # of scores, and good cutoff values aren't touchy).
   37 #
   38 # This implementation is due to Tim Peters et alia.
   39 
   40 import math
   41 import types
   42 try:
   43     from sets import Set
   44 except ImportError:
   45     from spambayes.compatsets import Set
   46 
   47 # XXX At time of writing, these are only necessary for the
   48 # XXX experimental url retrieving/slurping code.  If that
   49 # XXX gets ripped out, either rip these out, or run
   50 # XXX PyChecker over the code.
   51 import re
   52 import os
   53 import sys
   54 import socket
   55 import pickle
   56 import urllib2
   57 from email import message_from_string
   58 
   59 try:
   60     enumerate
   61 except NameError:
   62     def enumerate(seq):
   63         i = 0
   64         for elt in seq:
   65             yield (i, elt)
   66             i += 1
   67 
   68 DOMAIN_AND_PORT_RE = re.compile(r"([^:/\\]+)(:([\d]+))?")
   69 HTTP_ERROR_RE = re.compile(r"HTTP Error ([\d]+)")
   70 URL_KEY_RE = re.compile(r"[\W]")
   71 # XXX ---- ends ----
   72 
   73 from spambayes.Options import options
   74 from spambayes.chi2 import chi2Q
   75 
   76 try:
   77     True, False
   78 except NameError:
   79     # Maintain compatibility with Python 2.2
   80     True, False = 1, 0
   81 
   82 
   83 LN2 = math.log(2)       # used frequently by chi-combining
   84 
   85 slurp_wordstream = None
   86 
   87 PICKLE_VERSION = 5
   88 
   89 class WordInfo(object):
   90     # A WordInfo is created for each distinct word.  spamcount is the
   91     # number of trained spam msgs in which the word appears, and hamcount
   92     # the number of trained ham msgs.
   93     #
   94     # Invariant:  For use in a classifier database, at least one of
   95     # spamcount and hamcount must be non-zero.
   96     #
   97     # Important:  This is a tiny object.  Use of __slots__ is essential
   98     # to conserve memory.
   99     __slots__ = 'spamcount', 'hamcount'
  100 
  101     def __init__(self):
  102         self.__setstate__((0, 0))
  103 
  104     def __repr__(self):
  105         return "WordInfo" + repr((self.spamcount, self.hamcount))
  106 
  107     def __getstate__(self):
  108         return self.spamcount, self.hamcount
  109 
  110     def __setstate__(self, t):
  111         self.spamcount, self.hamcount = t
  112 
  113 
  114 class Classifier:
  115     # Defining __slots__ here made Jeremy's life needlessly difficult when
  116     # trying to hook this all up to ZODB as a persistent object.  There's
  117     # no space benefit worth getting from slots in this class; slots were
  118     # used solely to help catch errors earlier, when this code was changing
  119     # rapidly.
  120 
  121     #__slots__ = ('wordinfo',  # map word to WordInfo record
  122     #             'nspam',     # number of spam messages learn() has seen
  123     #             'nham',      # number of non-spam messages learn() has seen
  124     #            )
  125 
  126     # allow a subclass to use a different class for WordInfo
  127     WordInfoClass = WordInfo
  128 
  129     def __init__(self):
  130         self.wordinfo = {}
  131         self.probcache = {}
  132         self.nspam = self.nham = 0
  133 
  134     def __getstate__(self):
  135         return (PICKLE_VERSION, self.wordinfo, self.nspam, self.nham)
  136 
  137     def __setstate__(self, t):
  138         if t[0] != PICKLE_VERSION:
  139             raise ValueError("Can't unpickle -- version %s unknown" % t[0])
  140         (self.wordinfo, self.nspam, self.nham) = t[1:]
  141         self.probcache = {}
  142 
  143     # spamprob() implementations.  One of the following is aliased to
  144     # spamprob, depending on option settings.
  145     # Currently only chi-squared is available, but maybe there will be
  146     # an alternative again someday.
  147 
  148     # Across vectors of length n, containing random uniformly-distributed
  149     # probabilities, -2*sum(ln(p_i)) follows the chi-squared distribution
  150     # with 2*n degrees of freedom.  This has been proven (in some
  151     # appropriate sense) to be the most sensitive possible test for
  152     # rejecting the hypothesis that a vector of probabilities is uniformly
  153     # distributed.  Gary Robinson's original scheme was monotonic *with*
  154     # this test, but skipped the details.  Turns out that getting closer
  155     # to the theoretical roots gives a much sharper classification, with
  156     # a very small (in # of msgs), but also very broad (in range of scores),
  157     # "middle ground", where most of the mistakes live.  In particular,
  158     # this scheme seems immune to all forms of "cancellation disease":  if
  159     # there are many strong ham *and* spam clues, this reliably scores
  160     # close to 0.5.  Most other schemes are extremely certain then -- and
  161     # often wrong.
  162     def chi2_spamprob(self, wordstream, evidence=False):
  163         """Return best-guess probability that wordstream is spam.
  164 
  165         wordstream is an iterable object producing words.
  166         The return value is a float in [0.0, 1.0].
  167 
  168         If optional arg evidence is True, the return value is a pair
  169             probability, evidence
  170         where evidence is a list of (word, probability) pairs.
  171         """
  172 
  173         from math import frexp, log as ln
  174 
  175         # We compute two chi-squared statistics, one for ham and one for
  176         # spam.  The sum-of-the-logs business is more sensitive to probs
  177         # near 0 than to probs near 1, so the spam measure uses 1-p (so
  178         # that high-spamprob words have greatest effect), and the ham
  179         # measure uses p directly (so that lo-spamprob words have greatest
  180         # effect).
  181         #
  182         # For optimization, sum-of-logs == log-of-product, and f.p.
  183         # multiplication is a lot cheaper than calling ln().  It's easy
  184         # to underflow to 0.0, though, so we simulate unbounded dynamic
  185         # range via frexp.  The real product H = this H * 2**Hexp, and
  186         # likewise the real product S = this S * 2**Sexp.
  187         H = S = 1.0
  188         Hexp = Sexp = 0
  189 
  190         clues = self._getclues(wordstream)
  191         for prob, word, record in clues:
  192             S *= 1.0 - prob
  193             H *= prob
  194             if S < 1e-200:  # prevent underflow
  195                 S, e = frexp(S)
  196                 Sexp += e
  197             if H < 1e-200:  # prevent underflow
  198                 H, e = frexp(H)
  199                 Hexp += e
  200 
  201         # Compute the natural log of the product = sum of the logs:
  202         # ln(x * 2**i) = ln(x) + i * ln(2).
  203         S = ln(S) + Sexp * LN2
  204         H = ln(H) + Hexp * LN2
  205 
  206         n = len(clues)
  207         if n:
  208             S = 1.0 - chi2Q(-2.0 * S, 2*n)
  209             H = 1.0 - chi2Q(-2.0 * H, 2*n)
  210 
  211             # How to combine these into a single spam score?  We originally
  212             # used (S-H)/(S+H) scaled into [0., 1.], which equals S/(S+H).  A
  213             # systematic problem is that we could end up being near-certain
  214             # a thing was (for example) spam, even if S was small, provided
  215             # that H was much smaller.
  216             # Rob Hooft stared at these problems and invented the measure
  217             # we use now, the simpler S-H, scaled into [0., 1.].
  218             prob = (S-H + 1.0) / 2.0
  219         else:
  220             prob = 0.5
  221 
  222         if evidence:
  223             clues = [(w, p) for p, w, r in clues]
  224             clues.sort(lambda a, b: cmp(a[1], b[1]))
  225             clues.insert(0, ('*S*', S))
  226             clues.insert(0, ('*H*', H))
  227             return prob, clues
  228         else:
  229             return prob
  230 
  231     def slurping_spamprob(self, wordstream, evidence=False):
  232         """Do the standard chi-squared spamprob, but if the evidence
  233         leaves the score in the unsure range, and we have fewer tokens
  234         than max_discriminators, also generate tokens from the text
  235         obtained by following http URLs in the message."""
  236         h_cut = options["Categorization", "ham_cutoff"]
  237         s_cut = options["Categorization", "spam_cutoff"]
  238 
  239         # Get the raw score.
  240         prob, clues = self.chi2_spamprob(wordstream, True)
  241 
  242         # If necessary, enhance it with the tokens from whatever is
  243         # at the URL's destination.
  244         if len(clues) < options["Classifier", "max_discriminators"] and \
  245            prob > h_cut and prob < s_cut and slurp_wordstream:
  246             slurp_tokens = list(self._generate_slurp())
  247             slurp_tokens.extend([w for (w,p) in clues])
  248             sprob, sclues = self.chi2_spamprob(slurp_tokens, True)
  249             if sprob < h_cut or sprob > s_cut:
  250                 prob = sprob
  251                 clues = sclues
  252         if evidence:
  253             return prob, clues
  254         return prob
  255 
  256     if options["Classifier", "use_chi_squared_combining"]:
  257         if options["URLRetriever", "x-slurp_urls"]:
  258             spamprob = slurping_spamprob
  259         else:
  260             spamprob = chi2_spamprob
  261 
  262     def learn(self, wordstream, is_spam):
  263         """Teach the classifier by example.
  264 
  265         wordstream is a word stream representing a message.  If is_spam is
  266         True, you're telling the classifier this message is definitely spam,
  267         else that it's definitely not spam.
  268         """
  269         if options["Classifier", "x-use_bigrams"]:
  270             wordstream = self._enhance_wordstream(wordstream)
  271         if options["URLRetriever", "x-slurp_urls"]:
  272             wordstream = self._add_slurped(wordstream)
  273         self._add_msg(wordstream, is_spam)
  274 
  275     def unlearn(self, wordstream, is_spam):
  276         """In case of pilot error, call unlearn ASAP after screwing up.
  277 
  278         Pass the same arguments you passed to learn().
  279         """
  280         if options["Classifier", "x-use_bigrams"]:
  281             wordstream = self._enhance_wordstream(wordstream)
  282         if options["URLRetriever", "x-slurp_urls"]:
  283             wordstream = self._add_slurped(wordstream)
  284         self._remove_msg(wordstream, is_spam)
  285 
  286     def probability(self, record):
  287         """Compute, store, and return prob(msg is spam | msg contains word).
  288 
  289         This is the Graham calculation, but stripped of biases, and
  290         stripped of clamping into 0.01 thru 0.99.  The Bayesian
  291         adjustment following keeps them in a sane range, and one
  292         that naturally grows the more evidence there is to back up
  293         a probability.
  294         """
  295 
  296         spamcount = record.spamcount
  297         hamcount = record.hamcount
  298 
  299         # Try the cache first
  300         try:
  301             return self.probcache[spamcount][hamcount]
  302         except KeyError:
  303             pass
  304 
  305         nham = float(self.nham or 1)
  306         nspam = float(self.nspam or 1)
  307 
  308         assert hamcount <= nham
  309         hamratio = hamcount / nham
  310 
  311         assert spamcount <= nspam
  312         spamratio = spamcount / nspam
  313 
  314         prob = spamratio / (hamratio + spamratio)
  315 
  316         S = options["Classifier", "unknown_word_strength"]
  317         StimesX = S * options["Classifier", "unknown_word_prob"]
  318 
  319 
  320         # Now do Robinson's Bayesian adjustment.
  321         #
  322         #         s*x + n*p(w)
  323         # f(w) = --------------
  324         #           s + n
  325         #
  326         # I find this easier to reason about like so (equivalent when
  327         # s != 0):
  328         #
  329         #        x - p
  330         #  p +  -------
  331         #       1 + n/s
  332         #
  333         # IOW, it moves p a fraction of the distance from p to x, and
  334         # less so the larger n is, or the smaller s is.
  335 
  336         n = hamcount + spamcount
  337         prob = (StimesX + n * prob) / (S + n)
  338 
  339         # Update the cache
  340         try:
  341             self.probcache[spamcount][hamcount] = prob
  342         except KeyError:
  343             self.probcache[spamcount] = {hamcount: prob}
  344 
  345         return prob
  346 
  347     # NOTE:  Graham's scheme had a strange asymmetry:  when a word appeared
  348     # n>1 times in a single message, training added n to the word's hamcount
  349     # or spamcount, but predicting scored words only once.  Tests showed
  350     # that adding only 1 in training, or scoring more than once when
  351     # predicting, hurt under the Graham scheme.
  352     # This isn't so under Robinson's scheme, though:  results improve
  353     # if training also counts a word only once.  The mean ham score decreases
  354     # significantly and consistently, ham score variance decreases likewise,
  355     # mean spam score decreases (but less than mean ham score, so the spread
  356     # increases), and spam score variance increases.
  357     # I (Tim) speculate that adding n times under the Graham scheme helped
  358     # because it acted against the various ham biases, giving frequently
  359     # repeated spam words (like "Viagra") a quick ramp-up in spamprob; else,
  360     # adding only once in training, a word like that was simply ignored until
  361     # it appeared in 5 distinct training spams.  Without the ham-favoring
  362     # biases, though, and never ignoring words, counting n times introduces
  363     # a subtle and unhelpful bias.
  364     # There does appear to be some useful info in how many times a word
  365     # appears in a msg, but distorting spamprob doesn't appear a correct way
  366     # to exploit it.
  367     def _add_msg(self, wordstream, is_spam):
  368         self.probcache = {}    # nuke the prob cache
  369         if is_spam:
  370             self.nspam += 1
  371         else:
  372             self.nham += 1
  373 
  374         for word in Set(wordstream):
  375             record = self._wordinfoget(word)
  376             if record is None:
  377                 record = self.WordInfoClass()
  378 
  379             if is_spam:
  380                 record.spamcount += 1
  381             else:
  382                 record.hamcount += 1
  383 
  384             self._wordinfoset(word, record)
  385 
  386         self._post_training()
  387 
  388     def _remove_msg(self, wordstream, is_spam):
  389         self.probcache = {}    # nuke the prob cache
  390         if is_spam:
  391             if self.nspam <= 0:
  392                 raise ValueError("spam count would go negative!")
  393             self.nspam -= 1
  394         else:
  395             if self.nham <= 0:
  396                 raise ValueError("non-spam count would go negative!")
  397             self.nham -= 1
  398 
  399         for word in Set(wordstream):
  400             record = self._wordinfoget(word)
  401             if record is not None:
  402                 if is_spam:
  403                     if record.spamcount > 0:
  404                         record.spamcount -= 1
  405                 else:
  406                     if record.hamcount > 0:
  407                         record.hamcount -= 1
  408                 if record.hamcount == 0 == record.spamcount:
  409                     self._wordinfodel(word)
  410                 else:
  411                     self._wordinfoset(word, record)
  412 
  413         self._post_training()
  414 
  415     def _post_training(self):
  416         """This is called after training on a wordstream.  Subclasses might
  417         want to ensure that their databases are in a consistent state at
  418         this point.  Introduced to fix bug #797890."""
  419         pass
  420 
  421     # Return list of (prob, word, record) triples, sorted by increasing
  422     # prob.  "word" is a token from wordstream; "prob" is its spamprob (a
  423     # float in 0.0 through 1.0); and "record" is word's associated
  424     # WordInfo record if word is in the training database, or None if it's
  425     # not.  No more than max_discriminators items are returned, and have
  426     # the strongest (farthest from 0.5) spamprobs of all tokens in wordstream.
  427     # Tokens with spamprobs less than minimum_prob_strength away from 0.5
  428     # aren't returned.
  429     def _getclues(self, wordstream):
  430         mindist = options["Classifier", "minimum_prob_strength"]
  431 
  432         if options["Classifier", "x-use_bigrams"]:
  433             # This scheme mixes single tokens with pairs of adjacent tokens.
  434             # wordstream is "tiled" into non-overlapping unigrams and
  435             # bigrams.  Non-overlap is important to prevent a single original
  436             # token from contributing to more than one spamprob returned
  437             # (systematic correlation probably isn't a good thing).
  438 
  439             # First fill list raw with
  440             #     (distance, prob, word, record), indices
  441             # pairs, one for each unigram and bigram in wordstream.
  442             # indices is a tuple containing the indices (0-based relative to
  443             # the start of wordstream) of the tokens that went into word.
  444             # indices is a 1-tuple for an original token, and a 2-tuple for
  445             # a synthesized bigram token.  The indices are needed to detect
  446             # overlap later.
  447             raw = []
  448             push = raw.append
  449             pair = None
  450             # Keep track of which tokens we've already seen.
  451             # Don't use a Set here!  This is an innermost loop, so speed is
  452             # important here (direct dict fiddling is much quicker than
  453             # invoking Python-level Set methods; in Python 2.4 that will
  454             # change).
  455             seen = {pair: 1} # so the bigram token is skipped on 1st loop trip
  456             for i, token in enumerate(wordstream):
  457                 if i:   # not the 1st loop trip, so there is a preceding token
  458                     # This string interpolation must match the one in
  459                     # _enhance_wordstream().
  460                     pair = "bi:%s %s" % (last_token, token)
  461                 last_token = token
  462                 for clue, indices in (token, (i,)), (pair, (i-1, i)):
  463                     if clue not in seen:    # as always, skip duplicates
  464                         seen[clue] = 1
  465                         tup = self._worddistanceget(clue)
  466                         if tup[0] >= mindist:
  467                             push((tup, indices))
  468 
  469             # Sort raw, strongest to weakest spamprob.
  470             raw.sort()
  471             raw.reverse()
  472             # Fill clues with the strongest non-overlapping clues.
  473             clues = []
  474             push = clues.append
  475             # Keep track of which indices have already contributed to a
  476             # clue in clues.
  477             seen = {}
  478             for tup, indices in raw:
  479                 overlap = [i for i in indices if i in seen]
  480                 if not overlap: # no overlap with anything already in clues
  481                     for i in indices:
  482                         seen[i] = 1
  483                     push(tup)
  484             # Leave sorted from smallest to largest spamprob.
  485             clues.reverse()
  486 
  487         else:
  488             # The all-unigram scheme just scores the tokens as-is.  A Set()
  489             # is used to weed out duplicates at high speed.
  490             clues = []
  491             push = clues.append
  492             for word in Set(wordstream):
  493                 tup = self._worddistanceget(word)
  494                 if tup[0] >= mindist:
  495                     push(tup)
  496             clues.sort()
  497 
  498         if len(clues) > options["Classifier", "max_discriminators"]:
  499             del clues[0 : -options["Classifier", "max_discriminators"]]
  500         # Return (prob, word, record).
  501         return [t[1:] for t in clues]
  502 
  503     def _worddistanceget(self, word):
  504         record = self._wordinfoget(word)
  505         if record is None:
  506             prob = options["Classifier", "unknown_word_prob"]
  507         else:
  508             prob = self.probability(record)
  509         distance = abs(prob - 0.5)
  510         return distance, prob, word, record
  511 
  512     def _wordinfoget(self, word):
  513         return self.wordinfo.get(word)
  514 
  515     def _wordinfoset(self, word, record):
  516         self.wordinfo[word] = record
  517 
  518     def _wordinfodel(self, word):
  519         del self.wordinfo[word]
  520 
  521     def _enhance_wordstream(self, wordstream):
  522         """Add bigrams to the wordstream.
  523 
  524         For example, a b c -> a b "a b" c "b c"
  525 
  526         Note that these are *token* bigrams, and not *word* bigrams - i.e.
  527         'synthetic' tokens get bigram'ed, too.
  528 
  529         The bigram token is simply "bi:unigram1 unigram2" - a space should
  530         be sufficient as a separator, since spaces aren't in any other
  531         tokens, apart from 'synthetic' ones.  The "bi:" prefix is added
  532         to avoid conflict with tokens we generate (like "subject: word",
  533         which could be "word" in a subject, or a bigram of "subject:" and
  534         "word").
  535 
  536         If the experimental "Classifier":"x-use_bigrams" option is
  537         removed, this function can be removed, too.
  538         """
  539 
  540         last = None
  541         for token in wordstream:
  542             yield token
  543             if last:
  544                 # This string interpolation must match the one in
  545                 # _getclues().
  546                 yield "bi:%s %s" % (last, token)
  547             last = token
  548 
  549     def _generate_slurp(self):
  550         # We don't want to do this recursively and check URLs
  551         # on webpages, so we have this little cheat.
  552         if not hasattr(self, "setup_done"):
  553             self.setup()
  554             self.setup_done = True
  555         if not hasattr(self, "do_slurp") or self.do_slurp:
  556             if slurp_wordstream:
  557                 self.do_slurp = False
  558 
  559                 tokens = self.slurp(*slurp_wordstream)
  560                 self.do_slurp = True
  561                 self._save_caches()
  562                 return tokens
  563         return []
  564 
  565     def setup(self):
  566         # Can't import this at the top because it's circular.
  567         # XXX Someone smarter than me, please figure out the right
  568         # XXX way to do this.
  569         from spambayes.FileCorpus import ExpiryFileCorpus, FileMessageFactory
  570 
  571         username = options["globals", "proxy_username"]
  572         password = options["globals", "proxy_password"]
  573         server = options["globals", "proxy_server"]
  574         if server.find(":") != -1:
  575             server, port = server.split(':', 1)
  576         else:
  577             port = 8080
  578         if server:
  579             # Build a new opener that uses a proxy requiring authorization
  580             proxy_support = urllib2.ProxyHandler({"http" : \
  581                                                   "http://%s:%s@%s:%d" % \
  582                                                   (username, password,
  583                                                    server, port)})
  584             opener = urllib2.build_opener(proxy_support,
  585                                           urllib2.HTTPHandler)
  586         else:
  587             # Build a new opener without any proxy information.
  588             opener = urllib2.build_opener(urllib2.HTTPHandler)
  589 
  590         # Install it
  591         urllib2.install_opener(opener)
  592 
  593         # Setup the cache for retrieved urls
  594         age = options["URLRetriever", "x-cache_expiry_days"]*24*60*60
  595         dir = options["URLRetriever", "x-cache_directory"]
  596         if not os.path.exists(dir):
  597             # Create the directory.
  598             if options["globals", "verbose"]:
  599                 print >>sys.stderr, "Creating URL cache directory"
  600             os.makedirs(dir)
  601 
  602         self.urlCorpus = ExpiryFileCorpus(age, FileMessageFactory(),
  603                                           dir, cacheSize=20)
  604         # Kill any old information in the cache
  605         self.urlCorpus.removeExpiredMessages()
  606 
  607         # Setup caches for unretrievable urls
  608         self.bad_url_cache_name = os.path.join(dir, "bad_urls.pck")
  609         self.http_error_cache_name = os.path.join(dir, "http_error_urls.pck")
  610         if os.path.exists(self.bad_url_cache_name):
  611             b_file = file(self.bad_url_cache_name, "r")
  612             try:
  613                 self.bad_urls = pickle.load(b_file)
  614             except IOError, ValueError:
  615                 # Something went wrong loading it (bad pickle,
  616                 # probably).  Start afresh.
  617                 if options["globals", "verbose"]:
  618                     print >>sys.stderr, "Bad URL pickle, using new."
  619                 self.bad_urls = {"url:non_resolving": (),
  620                                  "url:non_html": (),
  621                                  "url:unknown_error": ()}
  622             b_file.close()
  623         else:
  624             if options["globals", "verbose"]:
  625                 print "URL caches don't exist: creating"
  626             self.bad_urls = {"url:non_resolving": (),
  627                         "url:non_html": (),
  628                         "url:unknown_error": ()}
  629         if os.path.exists(self.http_error_cache_name):
  630             h_file = file(self.http_error_cache_name, "r")
  631             try:
  632                 self.http_error_urls = pickle.load(h_file)
  633             except IOError, ValueError:
  634                 # Something went wrong loading it (bad pickle,
  635                 # probably).  Start afresh.
  636                 if options["globals", "verbose"]:
  637                     print >>sys.stderr, "Bad HHTP error pickle, using new."
  638                 self.http_error_urls = {}
  639             h_file.close()
  640         else:
  641             self.http_error_urls = {}
  642 
  643     def _save_caches(self):
  644         # XXX Note that these caches are never refreshed, which might not
  645         # XXX be a good thing long-term (if a previously invalid URL
  646         # XXX becomes valid, for example).
  647         for name, data in [(self.bad_url_cache_name, self.bad_urls),
  648                            (self.http_error_cache_name, self.http_error_urls),]:
  649             # Save to a temp file first, in case something goes wrong.
  650             cache = open(name + ".tmp", "w")
  651             pickle.dump(data, cache)
  652             cache.close()
  653             try:
  654                 os.rename(name + ".tmp", name)
  655             except OSError:
  656                 # Atomic replace isn't possible with win32, so just
  657                 # remove and rename.
  658                 os.remove(name)
  659                 os.rename(name + ".tmp", name)
  660 
  661     def slurp(self, proto, url):
  662         # We generate these tokens:
  663         #  url:non_resolving
  664         #  url:non_html
  665         #  url:http_XXX (for each type of http error encounted,
  666         #                for example 404, 403, ...)
  667         # And tokenise the received page (but we do not slurp this).
  668         # Actually, the special url: tokens barely showed up in my testing,
  669         # although I would have thought that they would more - this might
  670         # be due to an error, although they do turn up on occasion.  In
  671         # any case, we have to do the test, so generating an extra token
  672         # doesn't cost us anything apart from another entry in the db, and
  673         # it's only two entries, plus one for each type of http error
  674         # encountered, so it's pretty neglible.
  675         from spambayes.tokenizer import Tokenizer
  676 
  677         if options["URLRetriever", "x-only_slurp_base"]:
  678             url = self._base_url(url)
  679 
  680         # Check the unretrievable caches
  681         for err in self.bad_urls.keys():
  682             if url in self.bad_urls[err]:
  683                 return [err]
  684         if self.http_error_urls.has_key(url):
  685             return self.http_error_urls[url]
  686 
  687         # We check if the url will resolve first
  688         mo = DOMAIN_AND_PORT_RE.match(url)
  689         domain = mo.group(1)
  690         if mo.group(3) is None:
  691             port = 80
  692         else:
  693             port = mo.group(3)
  694         try:
  695             not_used = socket.getaddrinfo(domain, port)
  696         except socket.error:
  697             self.bad_urls["url:non_resolving"] += (url,)
  698             return ["url:non_resolving"]
  699 
  700         # If the message is in our cache, then we can just skip over
  701         # retrieving it from the network, and get it from there, instead.
  702         url_key = URL_KEY_RE.sub('_', url)
  703         cached_message = self.urlCorpus.get(url_key)
  704 
  705         if cached_message is None:
  706             # We're going to ignore everything that isn't text/html,
  707             # so we might as well not bother retrieving anything with
  708             # these extensions.
  709             parts = url.split('.')
  710             if parts[-1] in ('jpg', 'gif', 'png', 'css', 'js'):
  711                 self.bad_urls["url:non_html"] += (url,)
  712                 return ["url:non_html"]
  713 
  714             try:
  715                 if options["globals", "verbose"]:
  716                     print >>sys.stderr, "Slurping", url
  717                 f = urllib2.urlopen("%s://%s" % (proto, url))
  718             except (urllib2.URLError, socket.error), details:
  719                 mo = HTTP_ERROR_RE.match(str(details))
  720                 if mo:
  721                     self.http_error_urls[url] = "url:http_" + mo.group(1)
  722                     return ["url:http_" + mo.group(1)]
  723                 self.bad_urls["url:unknown_error"] += (url,)
  724                 return ["url:unknown_error"]
  725 
  726             try:
  727                 # Anything that isn't text/html is ignored
  728                 content_type = f.info().get('content-type')
  729                 if content_type is None or \
  730                    not content_type.startswith("text/html"):
  731                     self.bad_urls["url:non_html"] += (url,)
  732                     return ["url:non_html"]
  733 
  734                 page = f.read()
  735                 headers = str(f.info())
  736                 f.close()
  737             except socket.error:
  738                 # This is probably a temporary error, like a timeout.
  739                 # For now, just bail out.
  740                 return []
  741 
  742             fake_message_string = headers + "\r\n" + page
  743 
  744             # Retrieving the same messages over and over again will tire
  745             # us out, so we store them in our own wee cache.
  746             message = self.urlCorpus.makeMessage(url_key)
  747             message.setPayload(fake_message_string)
  748             self.urlCorpus.addMessage(message)
  749         else:
  750             fake_message_string = cached_message.as_string()
  751 
  752         msg = message_from_string(fake_message_string)
  753 
  754         # We don't want to do full header tokenising, as this is
  755         # optimised for messages, not webpages, so we just do the
  756         # basic stuff.
  757         bht = options["Tokenizer", "basic_header_tokenize"]
  758         bhto = options["Tokenizer", "basic_header_tokenize_only"]
  759         options["Tokenizer", "basic_header_tokenize"] = True
  760         options["Tokenizer", "basic_header_tokenize_only"] = True
  761 
  762         tokens = Tokenizer().tokenize(msg)
  763         pf = options["URLRetriever", "x-web_prefix"]
  764         tokens = ["%s%s" % (pf, tok) for tok in tokens]
  765 
  766         # Undo the changes
  767         options["Tokenizer", "basic_header_tokenize"] = bht
  768         options["Tokenizer", "basic_header_tokenize_only"] = bhto
  769         return tokens
  770 
  771     def _base_url(self, url):
  772         # To try and speed things up, and to avoid following
  773         # unique URLS, we convert the URL to as basic a form
  774         # as we can - so http://www.massey.ac.nz/~tameyer/index.html?you=me
  775         # would become http://massey.ac.nz and http://id.example.com
  776         # would become http://example.com
  777         url += '/'
  778         domain, garbage = url.split('/', 1)
  779         parts = domain.split('.')
  780         if len(parts) > 2:
  781             base_domain = parts[-2] + '.' + parts[-1]
  782             if len(parts[-1]) < 3:
  783                 base_domain = parts[-3] + '.' + base_domain
  784         else:
  785             base_domain = domain
  786         return base_domain
  787 
  788     def _add_slurped(self, wordstream):
  789         """Add tokens generated by 'slurping' (i.e. tokenizing
  790         the text at the web pages pointed to by URLs in messages)
  791         to the wordstream."""
  792         for token in wordstream:
  793             yield token
  794         slurped_tokens = self._generate_slurp()
  795         for token in slurped_tokens:
  796             yield token
  797 
  798     def _wordinfokeys(self):
  799         return self.wordinfo.keys()
  800 
  801 
  802 Bayes = Classifier