Talk:Machine Learning: Difference between revisions
Jump to navigation
Jump to search
some more glue code to slurp nb-discuss content |
m updated codes ftw |
||
| Line 13: | Line 13: | ||
from contextlib import closing | from contextlib import closing | ||
def | def decompress_url(u): | ||
with closing(urlopen(u)) as f: | with closing(urlopen(u)) as f: | ||
with closing(StringIO(f.read())) as fs: | with closing(StringIO(f.read())) as fs: | ||
| Line 19: | Line 19: | ||
return g.read() | return g.read() | ||
def | def date_in_discuss(m, y): | ||
if m < | if 1 <= m <= 12: | ||
if y > 2007: | |||
now = gmtime() | |||
yy, mm = now.tm_year, now.tm_mon | |||
if (y < yy) or ((y == yy) and (m <= mm)): | |||
return True | |||
elif (y == 2007) and (m >= 11): | |||
return True | |||
return False | |||
def datestr(m, y): | |||
try: | |||
ms = ('January', 'February', 'March', | |||
'April', 'May', 'June', 'July', | |||
'August', 'September', 'October', | |||
'November', 'December')[m - 1] | |||
return '-'.join((str(y), ms)) | |||
except IndexError: | |||
return None | return None | ||
def nb_gz_url(m, y, listname='noisebridge-discuss'): | |||
if not date_in_discuss(m, y): | |||
if (y | |||
return None | return None | ||
a = 'https://www.noisebridge.net/' | |||
b = 'pipermail/' | |||
c = '/'.join((listname, '')) | |||
d = datestr(m, y) | |||
return ''.join(( | e = '.txt.gz' | ||
return ''.join((a, b, c, d, e)) | |||
def | def all_nb_gz_urls(): | ||
now = gmtime() | now = gmtime() | ||
yy, mm = now.tm_year, now.tm_mon | |||
y, m = 2007, 11 | |||
while (y < yy) or ((y == yy) and (m <= mm)): | |||
yield nb_gz_url(m, y) | |||
if m < 12: | |||
m += 1 | |||
else: | else: | ||
m = 1 | |||
y += 1 | |||
def | def get_month(month, year): | ||
u = | u = nb_gz_url(month, year) | ||
s = | s = decompress_url(u) | ||
return s | return s | ||
def spew(): | def spew(): | ||
for u in | for u in all_nb_gz_urls(): | ||
yield | yield decompress_url(u) | ||
def dump_uncompressed(filename= | def dump_uncompressed(filename='nb_wtf.txt'): | ||
with open(filename, | with open(filename, 'w') as f: | ||
for s in spew(): | for s in spew(): | ||
f.write(s) | f.write(s) | ||
def | def compiled_pattern(key, cache={}): | ||
try: | |||
return cache[key] | |||
except KeyError: | |||
if key == 'msg_start': | |||
p = msg_start_pattern() | |||
elif key == 'msg_stop': | |||
p = msg_stop_pattern() | |||
else: | |||
return None | |||
cache[key] = re.compile(p) | |||
return cache[key] | |||
def msg_start_pattern(): | |||
# ... and so it begins: | # ... and so it begins: | ||
# 'From jacob at appelbaum.net Tue Nov 20 20:20:07 2007' | # 'From jacob at appelbaum.net Tue Nov 20 20:20:07 2007' | ||
# -> r'^From | # -> r'^From .*\s+\w{3}\s+\w{3}\s+\d+\s+\d{2}:\d{2}:\d{2} \d{4}$' | ||
# (return compiled regex roughly equivalent to the above) | # (return compiled regex roughly equivalent to the above) | ||
space = r'\s+' | space = r'\s+' | ||
datestr = space.join((r'\w{3}', r'\w{3}', r'\d+', | |||
r'\d{2}:\d{2}:\d{2} \d{4}')) | |||
datestr = space.join((r'\w{3}', r'\w{3}', r'\d+' | pattern = ''.join(('^', 'From .*', space, datestr, '$')) | ||
return re.compile(pattern) | |||
def msg_stop_pattern(): | |||
anchor = lambda s: ''.join(('^', s, '$')) | anchor = lambda s: ''.join(('^', s, '$')) | ||
htmldelim = anchor('-------------- next part --------------') | |||
listdelim = anchor('_______________________________________________') | |||
pattern = '|'.join((htmldelim, listdelim)) | |||
return re.compile(pattern) | |||
def msglists(s | def msglists(s): | ||
# | # yields list of strings for each msg in string s | ||
msg = [] | msg = [] | ||
p = compiled_pattern('msg_start') | |||
for r in s.splitlines(): | for r in s.splitlines(): | ||
if | if p.match(r): | ||
if msg: | if msg: | ||
yield msg | yield msg | ||
| Line 84: | Line 120: | ||
yield msg | yield msg | ||
def msg2dict(msg | def msg2dict(msg): | ||
# msg is list of strings | # msg is list of strings | ||
# return dict with headers, contents, | # return dict with headers, contents, cruft | ||
d = dict() | d = dict() | ||
if not msg | p = compiled_pattern('msg_start') | ||
if not (msg and p.match(msg[0])): | |||
d['bogus'] = msg | d['bogus'] = msg | ||
return d | return d | ||
cruft = '' | |||
for s in | ss = iter(msg) | ||
d['fromkey'] = next(ss) | |||
header_list = [] | |||
for s in ss: | |||
t = s.split(':', 1) | t = s.split(':', 1) | ||
if len(t) == | if len(t) != 2: | ||
try: | |||
header_list[-1][1] += s | |||
except IndexError: | |||
print 'this happened ???' | |||
header_list.append(['bogus_header', s]) | |||
else: | |||
k, v = t | k, v = t | ||
header_list.append([k, v.strip()]) | |||
if k == 'Message-ID': | if k == 'Message-ID': | ||
break | break | ||
d['headers'] = dict(header_list) | |||
# skip blank line(s) | |||
s = next(ss) | |||
while not s: | |||
s = next(ss) | |||
contents = [s.rstrip()] | |||
cruft = [] | |||
p = compiled_pattern('msg_stop') | |||
for s in ss: | |||
if p.match(s): | |||
cruft.append(s) | |||
break | |||
else: | else: | ||
d[' | contents.append(s.rstrip()) | ||
d['contents'] = contents | |||
if cruft: | |||
cruft.extend([s.rstrip() for s in ss]) | |||
d['cruft'] = cruft | |||
return d | return d | ||
| Line 112: | Line 168: | ||
smtp = dict() | smtp = dict() | ||
msgd = msg2dict(msg) | msgd = msg2dict(msg) | ||
q = (('From', 'fromline'), ('Date', 'dateline'), ('Subject', 'subjectline')) | headers = msgd['headers'] | ||
for | q = (('From', 'fromline'), | ||
('Date', 'dateline'), | |||
('Subject', 'subjectline')) | |||
for k, v in q: | |||
try: | |||
smtp[v] = headers[k] | |||
except KeyError: | |||
print 'header not found: ', v | |||
continue | |||
smtp['messageline'] = '\n'.join(msgd['contents']) | |||
return smtp | return smtp | ||
def dicterator(s): | def dicterator(s): | ||
for msg in msglists(s): | for msg in msglists(s): | ||
yield msg2dict(msg) | yield msg2dict(msg) | ||
def smtperator(s): | def smtperator(s): | ||
for msg in msglists(s): | for msg in msglists(s): | ||
yield msg2smtp(msg) | yield msg2smtp(msg) | ||
</pre> | </pre> | ||
Revision as of 19:49, 6 March 2014
Feb. 27, 2014
Folks met and hacked on the Noisebridge discuss mailing list. We created a 102MB text dump and a Python script to parse it (File:Py-piper-parser.txt). We wrote pseudocode for a naive Bayes filter to protect the world from trolls, and will implement it soon.
python to download and decompress nb-discuss archive
import re
from itertools import chain, islice
from StringIO import StringIO
from gzip import GzipFile
from time import gmtime
from urllib import urlopen
from contextlib import closing
def decompress_url(u):
    """Fetch the gzip file at URL u and return its decompressed contents."""
    with closing(urlopen(u)) as resp:
        # GzipFile needs a seekable file object, so buffer the whole
        # download in memory before decompressing it.
        payload = StringIO(resp.read())
        with closing(payload) as raw, GzipFile(fileobj=raw) as gz:
            return gz.read()
def date_in_discuss(m, y):
    """Return True if month m of year y is covered by the list archive.

    The covered range runs from November 2007 up to and including the
    current month (UTC).
    """
    if not 1 <= m <= 12:
        return False
    if y == 2007:
        # Only the last two months of the first year are covered.
        return m >= 11
    if y < 2007:
        return False
    today = gmtime()
    # Tuple comparison: earlier year, or same year and month not in the future.
    return (y, m) <= (today.tm_year, today.tm_mon)
def datestr(m, y):
    """Return the 'YEAR-MonthName' string for month m of year y.

    m -- month number, 1 (January) through 12 (December)
    y -- year; converted with str()

    Returns None when m is outside 1..12, e.g. datestr(11, 2007) gives
    '2007-November' and datestr(13, 2007) gives None.
    """
    months = ('January', 'February', 'March',
              'April', 'May', 'June', 'July',
              'August', 'September', 'October',
              'November', 'December')
    # Check the range explicitly: relying on IndexError alone lets m <= 0
    # wrap around through negative indexing (m == 0 used to give 'December').
    if not 1 <= m <= 12:
        return None
    return '-'.join((str(y), months[m - 1]))
def nb_gz_url(m, y, listname='noisebridge-discuss'):
    """Return the URL of the gzipped archive of listname for month m, year y.

    Returns None when date_in_discuss(m, y) is false.
    """
    if date_in_discuss(m, y):
        base = 'https://www.noisebridge.net/' + 'pipermail/'
        return base + listname + '/' + datestr(m, y) + '.txt.gz'
    return None
def all_nb_gz_urls():
    """Yield nb_gz_url(month, year) for every month from November 2007
    through the current month (UTC), in chronological order."""
    today = gmtime()
    last = (today.tm_year, today.tm_mon)
    year, month = 2007, 11
    while (year, month) <= last:
        yield nb_gz_url(month, year)
        month += 1
        if month > 12:
            # Roll over into January of the next year.
            year, month = year + 1, 1
def get_month(month, year):
    """Return the decompressed archive text for the given month and year."""
    return decompress_url(nb_gz_url(month, year))
def spew():
    """Return a lazy iterator over the decompressed text of every archive
    URL produced by all_nb_gz_urls(), in the same order."""
    return (decompress_url(url) for url in all_nb_gz_urls())
def dump_uncompressed(filename='nb_wtf.txt'):
    """Write every decompressed archive from spew() to filename, one after
    another, overwriting any existing file."""
    with open(filename, 'w') as out:
        out.writelines(spew())
def compiled_pattern(key, cache={}):
    """Return the compiled regex registered under key, building it on first use.

    key   -- 'msg_start' or 'msg_stop'; any other uncached key returns None
    cache -- memo dict; the mutable default is intentional, so compiled
             patterns are shared across calls
    """
    try:
        return cache[key]
    except KeyError:
        pass
    if key == 'msg_start':
        source = msg_start_pattern()
    elif key == 'msg_stop':
        source = msg_stop_pattern()
    else:
        return None
    cache[key] = re.compile(source)
    return cache[key]
def msg_start_pattern():
    """Return a compiled regex matching the 'From ...' line that opens a message.

    Example of a matching line:
        'From jacob at appelbaum.net  Tue Nov 20 20:20:07 2007'
    The assembled pattern is
        r'^From .*\\s+\\w{3}\\s+\\w{3}\\s+\\d+\\s+\\d{2}:\\d{2}:\\d{2} \\d{4}$'
    """
    gap = r'\s+'
    # weekday, month, day of month, then 'HH:MM:SS YYYY'
    fields = [r'\w{3}', r'\w{3}', r'\d+', r'\d{2}:\d{2}:\d{2} \d{4}']
    stamp = gap.join(fields)
    return re.compile('^' + 'From .*' + gap + stamp + '$')
def msg_stop_pattern():
    """Return a compiled regex matching either line that ends a message body:
    the 'next part' attachment delimiter or the row of underscores."""
    delimiters = ('-------------- next part --------------',
                  '_______________________________________________')
    # Anchor each delimiter so that only a whole line matches.
    return re.compile('|'.join('^' + d + '$' for d in delimiters))
def msglists(s):
    """Split archive text s into messages.

    Yields one list of lines per message; a new list begins at each line
    matching the 'msg_start' pattern. Any lines before the first such line
    are yielded as a list of their own.
    """
    starts_message = compiled_pattern('msg_start').match
    current = []
    for line in s.splitlines():
        if current and starts_message(line):
            yield current
            current = []
        current.append(line)
    if current:
        yield current
def msg2dict(msg):
    """Parse one message (a list of lines) into a dict.

    msg -- list of strings, as yielded by msglists()

    Returns a dict with these keys:
      'bogus'    -- the untouched input; the ONLY key when msg is empty or
                    its first line does not match the 'msg_start' pattern
      'fromkey'  -- the opening 'From ...' line
      'headers'  -- dict of header name -> value, read up to and including
                    'Message-ID'
      'contents' -- list of right-stripped body lines (empty if no body)
      'cruft'    -- the stop-delimiter line and everything after it; present
                    only when a 'msg_stop' line was found
    """
    d = dict()
    start = compiled_pattern('msg_start')
    if not (msg and start.match(msg[0])):
        d['bogus'] = msg
        return d
    ss = iter(msg)
    d['fromkey'] = next(ss)
    header_list = []
    for s in ss:
        t = s.split(':', 1)
        if len(t) != 2:
            # No colon: treat the line as a continuation of the previous
            # header, or record it as bogus if there is no previous header.
            if header_list:
                header_list[-1][1] += s
            else:
                print('this happened ???')
                header_list.append(['bogus_header', s])
        else:
            k, v = t
            header_list.append([k, v.strip()])
            if k == 'Message-ID':
                # Message-ID is taken as the last header line.
                break
    d['headers'] = dict(header_list)
    # Skip blank line(s). next() gets a default so that a message with no
    # body does not leak StopIteration into the generators that call this
    # function (which would silently end their iteration).
    s = next(ss, None)
    while s is not None and not s:
        s = next(ss, None)
    contents = [] if s is None else [s.rstrip()]
    cruft = []
    stop = compiled_pattern('msg_stop')
    for s in ss:
        if stop.match(s):
            cruft.append(s)
            break
        contents.append(s.rstrip())
    d['contents'] = contents
    if cruft:
        # Everything after the delimiter is kept, but apart from the body.
        cruft.extend([s.rstrip() for s in ss])
        d['cruft'] = cruft
    return d
def msg2smtp(msg):
    """Reduce one message (a list of lines) to a flat dict.

    Returns a dict with 'fromline', 'dateline' and 'subjectline' (each only
    if the matching From/Date/Subject header exists) and 'messageline', the
    body lines joined with newlines. A notice is printed for each missing
    header.
    """
    smtp = dict()
    msgd = msg2dict(msg)
    # msg2dict returns only a 'bogus' key for chunks that do not begin with
    # a 'From ...' line (e.g. text before the first message), so fall back
    # to empty headers/contents instead of raising KeyError.
    headers = msgd.get('headers', {})
    q = (('From', 'fromline'),
         ('Date', 'dateline'),
         ('Subject', 'subjectline'))
    for k, v in q:
        try:
            smtp[v] = headers[k]
        except KeyError:
            # Single-argument print() emits the same text as the Python 2
            # statement "print 'header not found: ', v".
            print(' '.join(('header not found: ', v)))
    smtp['messageline'] = '\n'.join(msgd.get('contents', []))
    return smtp
def dicterator(s):
    """Return a lazy iterator of msg2dict() results, one per message in
    archive text s."""
    return (msg2dict(lines) for lines in msglists(s))
def smtperator(s):
    """Return a lazy iterator of msg2smtp() results, one per message in
    archive text s."""
    return (msg2smtp(lines) for lines in msglists(s))
Word parsing python script
The function 'get_words' takes a list of email dictionaries and yields, for each message, the list of words in that message:
def get_words(lst):
    """Yield, for each email dict in lst, the whitespace-separated words of
    its 'messageline' value as a list."""
    for email in lst:
        yield email['messageline'].split()
Plans to improve by using nltk[1]