Talk:Machine Learning: Difference between revisions

From Noisebridge
Jump to navigation Jump to search
Danf (talk | contribs)
some more glue code to slurp nb-discuss content
Danf (talk | contribs)
m updated codes ftw
Line 13: Line 13:
from contextlib import closing
from contextlib import closing


def decompress_from_url(u):
def decompress_url(u):
   with closing(urlopen(u)) as f:
   with closing(urlopen(u)) as f:
     with closing(StringIO(f.read())) as fs:
     with closing(StringIO(f.read())) as fs:
Line 19: Line 19:
         return g.read()
         return g.read()


def discuss_gz_url(m, y):
def date_in_discuss(m, y):
   if m < 1 or m > 12:
   if 1 <= m <= 12:
    if y > 2007:
      now = gmtime()
      yy, mm = now.tm_year, now.tm_mon
      if (y < yy) or ((y == yy) and (m <= mm)):
        return True
    elif (y == 2007) and (m >= 11):
        return True
  return False
def datestr(m, y):
  try:
    ms = ('January', 'February', 'March',
          'April', 'May', 'June', 'July',
          'August', 'September', 'October',
          'November', 'December')[m - 1]
    return '-'.join((str(y), ms))
  except IndexError:
     return None
     return None
  if y < 2007:
 
    return None
def nb_gz_url(m, y, listname='noisebridge-discuss'):
  now = gmtime()
   if not date_in_discuss(m, y):
   if (y > now.tm_year) or (y == now.tm_year and m > now.tm_mon):
     return None
     return None
   mm = ('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December')
   a = 'https://www.noisebridge.net/'
  nb_pre = 'https://www.noisebridge.net/pipermail/noisebridge-discuss/'
   b = 'pipermail/'
   nb_post = '.txt.gz'
   c = '/'.join((listname, ''))
   s = '-'.join((str(y), mm[m-1]))
  d = datestr(m, y)
   return ''.join((nb_pre, s, nb_post))
  e = '.txt.gz'
   return ''.join((a, b, c, d, e))


def all_discuss_gz_urls():
def all_nb_gz_urls():
   now = gmtime()
   now = gmtime()
   for y in range(2007, now.tm_year + 1):
   yy, mm = now.tm_year, now.tm_mon
    if y == 2007:
  y, m = 2007, 11
      mm = range(11, 12 + 1) # start with November 2007
  while (y < yy) or ((y == yy) and (m <= mm)):
     elif y == now.tm_year:
    yield nb_gz_url(m, y)
       mm = range(1, now.tm_mon + 1)  # end with current month
     if m < 12:
       m += 1
     else:
     else:
       mm = range(1, 13)
       m = 1
    for m in mm:
       y += 1
       yield discuss_gz_url(m, y)


def discuss_a_month(month, year):
def get_month(month, year):
   u = discuss_gz_url(month, year)
   u = nb_gz_url(month, year)
   s = decompress_from_url(u)
   s = decompress_url(u)
   return s
   return s


def spew():
def spew():
   for u in all_discuss_gz_urls():
   for u in all_nb_gz_urls():
     yield decompress_from_url(u)
     yield decompress_url(u)


def dump_uncompressed(filename="nb_wtf.txt"):
def dump_uncompressed(filename='nb_wtf.txt'):
   with open(filename, "w") as f:
   with open(filename, 'w') as f:
     for s in spew():
     for s in spew():
       f.write(s)
       f.write(s)


def from_at_pattern():
def compiled_pattern(key, cache={}):
  try:
    return cache[key]
  except KeyError:
    if key == 'msg_start':
      p = msg_start_pattern()
    elif key == 'msg_stop':
      p = msg_stop_pattern()
    else:
      return None
    cache[key] = re.compile(p)
    return cache[key]
 
def msg_start_pattern():
   # ... and so it begins:
   # ... and so it begins:
   # 'From jacob at appelbaum.net  Tue Nov 20 20:20:07 2007'
   # 'From jacob at appelbaum.net  Tue Nov 20 20:20:07 2007'
   # -> r'^From \S+ at \S+\s+\w{3}\s+\w{3}\s+\d+\s+\d{2}:\d{2}:\d{2} \d{4}$'
   # -> r'^From .*\s+\w{3}\s+\w{3}\s+\d+\s+\d{2}:\d{2}:\d{2} \d{4}$'
   # (return compiled regex roughly equivalent to the above)
   # (return compiled regex roughly equivalent to the above)
   space = r'\s+'
   space = r'\s+'
  chars = r'\S+'
   datestr = space.join((r'\w{3}', r'\w{3}', r'\d+',
  from_at = space.join(('From', chars, 'at', chars))
                        r'\d{2}:\d{2}:\d{2} \d{4}'))
   datestr = space.join((r'\w{3}', r'\w{3}', r'\d+'))
  pattern = ''.join(('^', 'From .*', space, datestr, '$'))
  timestamp = r'\d{2}:\d{2}:\d{2} \d{4}'
  return re.compile(pattern)
 
def msg_stop_pattern():
   anchor = lambda s: ''.join(('^', s, '$'))
   anchor = lambda s: ''.join(('^', s, '$'))
   return re.compile(anchor(space.join((from_at, datestr, timestamp))))
   htmldelim = anchor('-------------- next part --------------')
  listdelim = anchor('_______________________________________________')
  pattern = '|'.join((htmldelim, listdelim))
  return re.compile(pattern)


def msglists(s, fromp=from_at_pattern()):
def msglists(s):
   # yield list of strings for each msg in string s
   # yields list of strings for each msg in string s
   msg = []
   msg = []
  p = compiled_pattern('msg_start')
   for r in s.splitlines():
   for r in s.splitlines():
     if fromp.match(r):
     if p.match(r):
       if msg:
       if msg:
         yield msg
         yield msg
Line 84: Line 120:
     yield msg
     yield msg


def msg2dict(msg, fromp=from_at_pattern()):
def msg2dict(msg):
   # msg is list of strings
   # msg is list of strings
   # return dict with headers, contents, etc
   # return dict with headers, contents, cruft
   d = dict()
   d = dict()
   if not msg or not fromp.match(msg[0]):
  p = compiled_pattern('msg_start')
   if not (msg and p.match(msg[0])):
     d['bogus'] = msg
     d['bogus'] = msg
     return d
     return d  
   stack = islice(msg, 1, None)
   cruft = ''
   for s in stack:
  ss = iter(msg)
  d['fromkey'] = next(ss)
  header_list = []
   for s in ss:
     t = s.split(':', 1)
     t = s.split(':', 1)
     if len(t) == 2:
     if len(t) != 2:
      try:
        header_list[-1][1] += s
      except IndexError:
        print 'this happened ???'
        header_list.append(['bogus_header', s])
    else:
       k, v = t
       k, v = t
       d[k] = v.strip()
       header_list.append([k, v.strip()])
       if k == 'Message-ID':
       if k == 'Message-ID':
         break
         break
  d['headers'] = dict(header_list)
  # skip blank line(s)
  s = next(ss)
  while not s:
    s = next(ss)
  contents = [s.rstrip()]
  cruft = []
  p = compiled_pattern('msg_stop')
  for s in ss:
    if p.match(s):
      cruft.append(s)
      break
     else:
     else:
       d['bogus'] = s
       contents.append(s.rstrip())
      break
  d['contents'] = contents
   # skip any leading blank lines
   if cruft:
  s = stack.next()
    cruft.extend([s.rstrip() for s in ss])
  while not s:
     d['cruft'] = cruft
     s = stack.next()
  d['contents'] = list(chain((s,), stack))
   return d
   return d


Line 112: Line 168:
   smtp = dict()
   smtp = dict()
   msgd = msg2dict(msg)
   msgd = msg2dict(msg)
   q = (('From', 'fromline'), ('Date', 'dateline'), ('Subject', 'subjectline'))
  headers = msgd['headers']
   for t in q:
   q = (('From', 'fromline'),
     k, j = t
      ('Date', 'dateline'),
    s = msgd.get(k)
      ('Subject', 'subjectline'))
     if s:
   for k, v in q:
       smtp[j] = s
     try:
  if ('bogus' in msgd) or not ('contents' in msgd):
      smtp[v] = headers[k]
    return smtp
     except KeyError:
  message = ''
       print 'header not found: ', v
  htmldelim = '-------------- next part --------------'
      continue
   for s in msgd['contents']:
   smtp['messageline'] = '\n'.join(msgd['contents'])
    if s == htmldelim:
      break
    if message:
      message += '\n'
    message += s
  smtp['messageline'] = message
   return smtp
   return smtp


def dicterator(s):
def dicterator(s):
   for msg in msglists(s):
   for msg in msglists(s):
     yield msg2dict(msg)
     yield msg2dict(msg)  


def smtperator(s):
def smtperator(s):
   for msg in msglists(s):
   for msg in msglists(s):
     yield msg2smtp(msg)
     yield msg2smtp(msg)  


</pre>
</pre>

Revision as of 19:49, 6 March 2014

Feb. 27, 2014

Folks met and hacked on the noisebridge discuss mailing list. We created a 102MB text dump, and a python script to parse it, File:Py-piper-parser.txt. We wrote pseudo code to implement a Naive Bayesian filter to protect the world from trolls. Will implement soon.

Python script to download and decompress the nb-discuss archive

import re
from itertools import chain, islice
from StringIO import StringIO
from gzip import GzipFile
from time import gmtime
from urllib import urlopen
from contextlib import closing

def decompress_url(u):
  # Download the gzip file at URL u and return its decompressed contents
  # as a string.
  with closing(urlopen(u)) as remote:
    raw = remote.read()
  with closing(StringIO(raw)) as buf, GzipFile(fileobj=buf) as unzipped:
    return unzipped.read()

def date_in_discuss(m, y):
  # True iff month m of year y falls inside the nb-discuss archive's
  # range: November 2007 through the current (UTC) month, inclusive.
  if not 1 <= m <= 12:
    return False
  if y < 2007:
    return False
  if y == 2007:
    return m >= 11
  t = gmtime()
  return (y, m) <= (t.tm_year, t.tm_mon)
 
def datestr(m, y):
  """Return 'YYYY-MonthName' for month m (1-12) of year y, or None when
  m is out of range.  Matches pipermail archive naming, e.g.
  datestr(11, 2007) -> '2007-November'."""
  # Explicit range check: the old code relied on catching IndexError,
  # but m == 0 (and small negative m) hit a valid *negative* tuple index
  # and silently returned the wrong month instead of None.
  if not 1 <= m <= 12:
    return None
  months = ('January', 'February', 'March',
            'April', 'May', 'June', 'July',
            'August', 'September', 'October',
            'November', 'December')
  return '-'.join((str(y), months[m - 1]))

def nb_gz_url(m, y, listname='noisebridge-discuss'):
  # URL of the gzipped monthly text archive for the given list, or None
  # when (m, y) is outside the archive's range.
  if not date_in_discuss(m, y):
    return None
  pieces = ('https://www.noisebridge.net/',
            'pipermail/',
            '/'.join((listname, '')),
            datestr(m, y),
            '.txt.gz')
  return ''.join(pieces)

def all_nb_gz_urls():
  # Yield the archive URL for every month from November 2007 through
  # the current (UTC) month, in chronological order.
  t = gmtime()
  end = (t.tm_year, t.tm_mon)
  y, m = 2007, 11
  while (y, m) <= end:
    yield nb_gz_url(m, y)
    m += 1
    if m > 12:
      y, m = y + 1, 1

def get_month(month, year):
  # Download and decompress a single month of the archive.
  return decompress_url(nb_gz_url(month, year))

def spew():
  # Lazily yield the decompressed text of every monthly archive.
  return (decompress_url(url) for url in all_nb_gz_urls())

def dump_uncompressed(filename='nb_wtf.txt'):
  # Concatenate every month's decompressed archive into one text file.
  with open(filename, 'w') as out:
    out.writelines(spew())

def compiled_pattern(key, cache={}):
  """Return the compiled regex named by key ('msg_start' or 'msg_stop'),
  or None for an unknown key.

  cache is a deliberate mutable-default memo shared across calls, so
  each pattern is only built once.
  """
  try:
    return cache[key]
  except KeyError:
    if key == 'msg_start':
      p = msg_start_pattern()
    elif key == 'msg_stop':
      p = msg_stop_pattern()
    else:
      return None
    # msg_start_pattern()/msg_stop_pattern() already return compiled
    # regex objects; the old code redundantly re.compile()d them.
    cache[key] = p
    return p

def msg_start_pattern():
  # Compiled regex for a mailbox 'From ' separator line, e.g.
  # 'From jacob at appelbaum.net  Tue Nov 20 20:20:07 2007'
  # i.e. r'^From .*\s+\w{3}\s+\w{3}\s+\d+\s+\d{2}:\d{2}:\d{2} \d{4}$'
  ws = r'\s+'
  date_part = ws.join((r'\w{3}', r'\w{3}', r'\d+',
                       r'\d{2}:\d{2}:\d{2} \d{4}'))
  return re.compile('^From .*' + ws + date_part + '$')

def msg_stop_pattern():
  # Compiled regex matching either mailman delimiter line that ends the
  # message body: the HTML-attachment marker or the list-footer rule.
  delims = ('-------------- next part --------------',
            '_______________________________________________')
  return re.compile('|'.join('^' + d + '$' for d in delims))

def msglists(s):
  # Split raw mailbox text s into messages: yields one list of lines
  # per message, each message beginning at a 'From ...' separator line.
  # Any leading lines before the first separator stay in the first list.
  start = compiled_pattern('msg_start')
  current = []
  for line in s.splitlines():
    if start.match(line) and current:
      yield current
      current = []
    current.append(line)
  if current:
    yield current

def msg2dict(msg):
  """Parse one message (list of lines, as produced by msglists) into a
  dict with keys:
    'fromkey'  - the leading 'From ...' separator line
    'headers'  - dict of header name -> value (collected up to Message-ID)
    'contents' - list of body lines, rstripped
    'cruft'    - stop-delimiter line plus everything after it, if present
    'bogus'    - the raw input, when msg does not start with a From line
  """
  d = dict()
  start = compiled_pattern('msg_start')
  if not (msg and start.match(msg[0])):
    d['bogus'] = msg
    return d
  ss = iter(msg)
  d['fromkey'] = next(ss)
  # Collect headers until the Message-ID header; lines without ':' are
  # treated as continuations and appended to the previous header value.
  header_list = []
  for s in ss:
    t = s.split(':', 1)
    if len(t) != 2:
      try:
        header_list[-1][1] += s
      except IndexError:
        print('this happened ???')
        header_list.append(['bogus_header', s])
    else:
      k, v = t
      header_list.append([k, v.strip()])
      if k == 'Message-ID':
        break
  d['headers'] = dict(header_list)
  # Skip the blank line(s) separating headers from the body.  A message
  # truncated right after its headers ends here; the old code let
  # StopIteration escape and crash callers (fatal inside generators).
  try:
    s = next(ss)
    while not s:
      s = next(ss)
  except StopIteration:
    d['contents'] = []
    return d
  contents = [s.rstrip()]
  cruft = []
  stop = compiled_pattern('msg_stop')
  for s in ss:
    if stop.match(s):
      cruft.append(s)
      break
    contents.append(s.rstrip())
  d['contents'] = contents
  if cruft:
    cruft.extend([s.rstrip() for s in ss])
    d['cruft'] = cruft
  return d

def msg2smtp(msg):
  """Reduce one message (list of lines) to a small dict with
  'fromline', 'dateline', 'subjectline' (when those headers exist) and
  'messageline' (the body lines joined with newlines)."""
  smtp = dict()
  msgd = msg2dict(msg)
  # msg2dict marks unparseable input with 'bogus' and omits the
  # 'headers'/'contents' keys; the old code indexed them unconditionally
  # and raised KeyError on such messages.
  headers = msgd.get('headers', {})
  wanted = (('From', 'fromline'),
            ('Date', 'dateline'),
            ('Subject', 'subjectline'))
  for k, v in wanted:
    try:
      smtp[v] = headers[k]
    except KeyError:
      print('header not found: %s' % v)
      continue
  smtp['messageline'] = '\n'.join(msgd.get('contents', []))
  return smtp

def dicterator(s):
  # Parsed-dict view of mailbox text s: one msg2dict() result per message.
  return (msg2dict(m) for m in msglists(s))

def smtperator(s):
  # SMTP-style view of mailbox text s: one msg2smtp() result per message.
  return (msg2smtp(m) for m in msglists(s))

Word parsing python script

Function 'get_words' takes a list of email dictionaries. For each message, it yields the list of words in that message:

 def get_words(lst):
   # lst: iterable of message dicts (e.g. from smtperator()), each with a
   # 'messageline' string.  Yields the whitespace-split word list of each
   # message, one list per message.
   for d in lst:
     m = d['messageline']
     yield m.split()

Plans to improve by using nltk[1]