# About this page: simple, compact, silly XHTML reports on literary texts in 44 lines of code.
#!/usr/bin/env python
# encoding: utf-8
# A game by The MYYN | some text -> some colorful silly stats as xhtml
import codecs, nltk, string, jinja2, random, cgi, sys, os
from operator import itemgetter; from nltk.corpus import stopwords
# Palette of CSS hex colours from which the per-run highlight colours are drawn.
ALLCOLORS = (
    '#98FB98',
    '#FFDEAD',
    '#FF1493',
    '#808080',
    '#D3D3D3',
    '#FFA500',
    '#90EE90',
    '#FFFFFF',
    '#F0F8FF',
    '#FF7F50',
)

# Six distinct palette entries, picked at random each time the script runs.
COLORS = random.sample(ALLCOLORS, 6)
# Jinja2 source for the XHTML 1.1 report page.
#
# Variables referenced by the template:
#   index_url, title           -- link target and text of the page heading
#   word_count, stopword_count,
#   unique_word_count,
#   unique_word_count_no_stop  -- cells of the summary table
#   longest_words              -- iterable of words listed under "50 Longest words"
#   shortest_words             -- only referenced inside an HTML comment
#   tup                        -- iterable of (text, font-size in px, background colour)
#   the_code                   -- text shown in the trailing <pre><code> listing
#
# Fix: the body rule used to contain a stray, property-less declaration
# (": monospace;") after the font shorthand; it has been removed.
TEMPLATE = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"> <head><title>struts</title><style type="text/css"> body{width:1000px;
margin:auto; font: 75% "Lucida Grande", "Trebuchet MS", Verdana, sans-serif;} table{color:gray;} li{ list-style: none; }
#title { background: black; color:white; }.skip { margin-top:15px; } code { font-size: 9px; color:darkslategray; }</style></head><body><div class="b">
<h1 id="title"><strong><a style="color:white;" href="{{ index_url }}">index</a> | {{ title }}</strong></h1>
<!-- <img alt="X" src="http://tinyurl.com/dlqbeu"></img> -->
<div class="skip"></div><p><strong>A table</strong>:</p> <table border="0" cellspacing="3" cellpadding="2">
<tr><td>Words</td><td>{{ word_count }}</td></tr>
<tr><td>Stopwords</td><td>{{ stopword_count }}</td></tr> <tr><td>Unique Words (including stopwords)</td>
<td>{{ unique_word_count }}</td></tr>
<tr><td>Unique Words (excluding stopwords)</td><td>{{ unique_word_count_no_stop }}</td></tr> </table>
<p><strong>50 Longest words</strong>:</p>
<p>{% for word in longest_words %} {{ word }} {% endfor %} </p> <div class="skip"></div>
<!-- <p><strong>50 Shortest words</strong>:</p>
<p>{% for word in shortest_words %} {{ word }} {% endfor %} </p> <div class="skip"></div> -->
<p><strong>Frequency distribution of non-stopwords</strong>:</p>
<!-- <script type="text/javascript">document.write(Math.max(0,1));</script> -->
<p>{% for e in tup %} <span style='font-size: {{ e[1] }}px; background:{{ e[2] }}'>{{ e[0] }}</span> {% endfor %}</p>
<p style="font-size: 10px; text-align:center; border-top: dotted thin gray; margin-top: 5px; padding-top: 5px; color: gray">
A game by <a style="color:gray;" href="http://myyn.org/">The MYYN</a> | <a href="http://dpaste.com/hold/37153/">Source Code</a></p>
<pre><code>{{ the_code }}</code></pre></div></body></html>"""
# --- Script body ------------------------------------------------------------
# Usage: <script> FILENAME [LANGUAGE]
# Reads FILENAME (decoded as latin-1), computes word statistics with NLTK and
# writes an XHTML report to "<FILENAME without extension>.html".

# Argument validation: a file name is mandatory and must exist.
if len(sys.argv) < 2:
    print("Usage: %s FILENAME [LANGUAGE]" % sys.argv[0])
    sys.exit(1)
if not os.path.exists(sys.argv[1]):
    print("No such file.")
    sys.exit(1)

title = os.path.splitext(sys.argv[1])[0]
index_url = "index.html"
# Optional second argument names the NLTK stopword list; defaults to German.
language = sys.argv[2] if len(sys.argv) > 2 else 'german'

# Read and tokenize the input; the handle is closed as soon as it is read.
with codecs.open(sys.argv[1], encoding='latin-1') as source_file:
    content = source_file.read()
tokens = [token.lower() for token in nltk.word_tokenize(content)]
word_count, unique_word_count = len(tokens), len(set(tokens))

# Load the stopword list once and keep it as a set, instead of re-reading it
# for every single token.
stopword_set = set(stopwords.words(language))
nostop_tokens = [token for token in tokens if token not in stopword_set]
stopword_count = word_count - len(nostop_tokens)
unique_nostop_tokens = set(nostop_tokens)
unique_word_count_no_stop = len(unique_nostop_tokens)

# Unique non-stopwords ordered by length (shortest first).
_wordlist = sorted(unique_nostop_tokens, key=len)
# [-50:] yields the whole list when it holds fewer than 50 words; the former
# start index len(_wordlist) - 50 went negative in that case and silently
# dropped words from the front.
longest_words = _wordlist[-50:]
shortest_words = _wordlist[:50]

# (word, font size in px capped at 160, random background colour), ordered by
# descending frequency.
fdist = nltk.FreqDist(nostop_tokens)
tup = [
    (word, min(160, count * 2 + 8), random.choice(COLORS))
    for word, count in sorted(dict(fdist).items(), key=itemgetter(1), reverse=True)
]

# The script embeds its own source in the report. It is read as unicode and
# left unescaped here because the template below autoescapes every value.
with codecs.open(__file__, encoding='utf-8', errors='replace') as code_file:
    the_code = code_file.read()

# autoescape=True keeps tokens such as "<" or "&" from breaking the XHTML.
t = jinja2.Template(TEMPLATE, autoescape=True)
report = t.render(
    title=title,
    index_url=index_url,
    word_count=word_count,
    stopword_count=stopword_count,
    unique_word_count=unique_word_count,
    unique_word_count_no_stop=unique_word_count_no_stop,
    longest_words=longest_words,
    shortest_words=shortest_words,
    tup=tup,
    the_code=the_code,
)

# Render first, then open the output, so a rendering error does not leave a
# truncated report behind.
with codecs.open('%s.html' % title, encoding='utf-8', mode='w') as out:
    out.write(report)
# A game by The MYYN