Talk:Machine Learning: Difference between revisions
Jump to navigation
Jump to search
utility code to retrieve & decompress archived nb-discuss
|||
| Line 1: | Line 1: | ||
== Feb. 27, 2014 == | == Feb. 27, 2014 == | ||
Folks met and hacked on the noisebridge discuss mailing list. We created a | Folks met and hacked on the noisebridge discuss mailing list. We created a 102MB text dump, and a python script to parse it, [[File:Py-piper-parser.txt]]. We wrote pseudo code to implement a Naive Bayesian filter to protect the world from trolls. Will implement soon. | ||
== python to download and decompress nb-discuss archive == | |||
<pre> | |||
from StringIO import StringIO | |||
from gzip import GzipFile | |||
from time import gmtime | |||
from urllib import urlopen | |||
def decompress_from_url(u): | |||
# return GzipFile(fileobj = StringIO(urlopen(u).read())).read() | |||
f = urlopen(u) | |||
fs = StringIO(f.read()) | |||
g = GzipFile(fileobj = fs) | |||
s = g.read() | |||
for x in (f, fs, g): | |||
x.close() | |||
return s | |||
def discuss_gz_url(m, y): | |||
if m < 1 or m > 12: | |||
return None | |||
if y < 2007: | |||
return None | |||
now = gmtime() | |||
if (y > now.tm_year) or (y == now.tm_year and m > now.tm_mon): | |||
return None | |||
mm = ('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December') | |||
nb_pre = 'https://www.noisebridge.net/pipermail/noisebridge-discuss/' | |||
nb_post = '.txt.gz' | |||
s = '-'.join((str(y), mm[m-1])) | |||
return ''.join((nb_pre, s, nb_post)) | |||
def all_discuss_gz_urls(): | |||
now = gmtime() | |||
for y in range(2007, now.tm_year + 1): | |||
if y == 2007: | |||
mm = range(11, 12 + 1) # start with November 2007 | |||
elif y == now.tm_year: | |||
mm = range(1, now.tm_mon + 1) # end with current month | |||
else: | |||
mm = range(1, 13) | |||
for m in mm: | |||
yield discuss_gz_url(m, y) | |||
def discuss_a_month(month, year): | |||
u = discuss_gz_url(month, year) | |||
s = decompress_from_url(u) | |||
return s | |||
def spew(): | |||
for u in all_discuss_gz_urls(): | |||
yield decompress_from_url(u) | |||
def dump_uncompressed(filename="nb_wtf.txt"): | |||
with open(filename, "w") as f: | |||
for s in spew(): | |||
f.write(s) | |||
</pre> | |||
Revision as of 23:47, 1 March 2014
Feb. 27, 2014
Folks met and hacked on the noisebridge discuss mailing list. We created a 102MB text dump, and a python script to parse it, File:Py-piper-parser.txt. We wrote pseudo code to implement a Naive Bayesian filter to protect the world from trolls. Will implement soon.
python to download and decompress nb-discuss archive
from StringIO import StringIO
from gzip import GzipFile
from time import gmtime
from urllib import urlopen
def decompress_from_url(u):
    """Download the gzipped file at URL *u* and return its decompressed contents.

    Opens the URL, buffers the compressed bytes in memory, gunzips them,
    and returns the resulting text. All three handles are closed even if a
    read fails partway (the original closed them only on the success path,
    leaking the URL connection on error).
    """
    # Local import keeps this block self-contained in the wiki snippet.
    from contextlib import closing
    with closing(urlopen(u)) as f:
        # GzipFile needs a seekable file object, so buffer the download first.
        with closing(StringIO(f.read())) as fs:
            with closing(GzipFile(fileobj=fs)) as g:
                return g.read()
def discuss_gz_url(m, y):
    """Return the pipermail archive URL for month *m* (1-12) of year *y*.

    Returns None for an invalid month, a year before the list existed
    (2007), or a month that has not happened yet (per current UTC time).
    """
    # Guard clauses: reject out-of-range inputs up front.
    if not 1 <= m <= 12:
        return None
    if y < 2007:
        return None
    now = gmtime()
    # Tuple comparison: (year, month) in the future means no archive yet.
    if (y, m) > (now.tm_year, now.tm_mon):
        return None
    month_names = ('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December')
    base = 'https://www.noisebridge.net/pipermail/noisebridge-discuss/'
    # Archive files are named like "2007-November.txt.gz".
    return base + str(y) + '-' + month_names[m - 1] + '.txt.gz'
def all_discuss_gz_urls():
    """Yield the archive URL for every month from November 2007 through now.

    The list started in November 2007; the final year stops at the current
    UTC month.
    """
    now = gmtime()
    for year in range(2007, now.tm_year + 1):
        # Pick the month span for this year: partial first year, partial
        # current year, full years in between.
        if year == 2007:
            first, last = 11, 12
        elif year == now.tm_year:
            first, last = 1, now.tm_mon
        else:
            first, last = 1, 12
        for month in range(first, last + 1):
            yield discuss_gz_url(month, year)
def discuss_a_month(month, year):
    """Return the decompressed archive text for one (month, year)."""
    return decompress_from_url(discuss_gz_url(month, year))
def spew():
    """Return a lazy stream of every month's decompressed archive text."""
    return (decompress_from_url(url) for url in all_discuss_gz_urls())
def dump_uncompressed(filename="nb_wtf.txt"):
    """Write every month's decompressed archive, concatenated, to *filename*."""
    with open(filename, "w") as out:
        # writelines consumes the generator, writing each chunk in turn.
        out.writelines(spew())
Word parsing python script
Function 'get_words' takes a list of email dictionaries. For each message, it yields the list of words in that message:
def get_words(lst):
    """For each email dict in *lst*, yield its 'messageline' split into words."""
    for record in lst:
        yield record['messageline'].split()
Plans to improve by using nltk[1]