// ==UserScript==
// @name          Readability Analyzer
// @namespace     http://broofa.com/Tools/GreaseMonkey
// @description	  Appends readability statistics to the end of the page
// @include       http://*
// ==/UserScript==
// Notes:
//   * is a wildcard character
//   .tld is magic that matches all top-level domains (e.g. .com, .co.uk, .us, etc.)

/**
 * Round a number to a fixed number of decimal places.
 *
 * Despite the method name, the argument counts digits after the decimal
 * point, not significant digits: (3.14159).sig(2) yields 3.14.
 *
 * @param {number} nDigits Number of digits to keep after the decimal point
 * @return {number} The rounded value
 */
Number.prototype.sig = function(nDigits) {
    var scale = Math.pow(10, nDigits);
    var scaled = Math.round(this * scale);
    return scaled / scale;
};

/**
 * Escape HTML-sensitive characters (&, < and >) so the string can be inserted
 * into markup as literal text.
 *
 * @return {string} The escaped string
 */
String.prototype.escapeForHTML = function() {
    // '&' must be handled first; otherwise the ampersands introduced by the
    // '&lt;' / '&gt;' replacements would themselves be escaped again.
    return this.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
};

/**
 * Reduce an HTML fragment to its plain text.
 *
 * Anything that looks like a tag (<...>) is dropped, and so is anything that
 * looks like a named character reference (&word;). References are removed,
 * not decoded.
 *
 * @return {string} The text with tags and character references removed
 */
String.prototype.toPlainText = function() {
    return this
        .replace(/<[^>]*>/g, '')   // tags
        .replace(/&\w*;/g, '');    // character references
};

/**
 * Limit a string to at most `len` characters, marking truncation with '...'.
 *
 * @param {number} len Maximum length of the result
 * @return {string} The original text when it already fits; otherwise the
 *     first len-3 characters followed by '...'. When len is smaller than the
 *     ellipsis itself (len < 3) the text is simply cut to len characters so
 *     the result never exceeds the requested length.
 */
String.prototype.shorten = function(len) {
    // Work on a primitive so callers never receive a String wrapper object
    var s = String(this);

    if (s.length <= len) {
        return s;
    }
    if (len < 3) {
        return s.slice(0, Math.max(0, len));
    }
    return s.slice(0, len - 3) + '...';
};

/**
 * Readability: an empty constructor used purely as a namespace. Every setting
 * and function of the analyzer hangs off it as a static property.
 */
function Readability() {
}

// Timestamp (ms since the epoch) captured when the script starts running
Readability._time = (new Date()).getTime();

// Debug output is enabled when the page URL contains "debug_readability"
Readability.DEBUG = location.toString().indexOf('debug_readability') >= 0;

// Element names treated as paragraph-like containers of text
Readability.P_TAGS =  /^(body|h[1-6]|div|li|p|pre|dd|dt|td|th|blockquote)$/i;

// Element names whose presence inside a paragraph disqualifies that paragraph
Readability.DQ_TAGS = /^(dl|fieldset|form|ol|table|ul|tbody|tr)$/i;

// Element names whose content is never analyzed
// ("noframes" is listed twice; the repetition has no effect on matching)
Readability.SKIP_TAGS = /^(noframes|noscript|frameset|frame|iframe|input|noframes|textarea|script|select|style|head|tfoot|thead)$/i;


// Reason the analysis was cut short; stays empty while no problem is detected
Readability._abort = '';
/**
 * Recursive method for gathering readability statistics.
 *
 * Walks the DOM depth-first. A paragraph-like element (see P_TAGS) is handed
 * to processNode() only when none of its descendants disqualified it; once
 * processed it reports itself as invalid so that no enclosing element counts
 * the same text again.
 *
 * @param {Node} [node] Node to analyze. Omit for the top-level call, which
 *     starts at document.body.
 * @param {Object} [stats] Accumulator shared by the whole traversal; created
 *     by the top-level call.
 * @return {Object|boolean} For the top-level call, the stats object (with the
 *     syllable total scaled by 136/128). For recursive calls, true when this
 *     subtree may be counted as part of an enclosing paragraph.
 */
Readability.getStats = function(node, stats) {
    // Top level call?
    var myNode = node || document.body;
    stats = stats || {
        paragraphs: 0,
        ignored: 0,
        sentences: 0,
        words: 0,
        hardwords: 0,
        syllables: 0,
        characters: 0
    };

    var valid = false;

    // Reasons not to descend, cheapest first; the clock is only consulted for
    // nodes that would otherwise be analyzed.
    var skip = Readability._abort                        // a problem was already detected
        || myNode.id == 'readability'                    // our own content
        || Readability.SKIP_TAGS.test(myNode.nodeName);  // tags known to have un-texty content
    if (!skip && Readability.elapsed() > 1.0) {
        // Bail out if we're taking too long
        Readability._abort = 'it was taking too long';
        skip = true;
    }

    if (!skip) {
        // Mark us as invalid if we're a disqualifying tag
        valid = !Readability.DQ_TAGS.test(myNode.nodeName);

        // Flag indicating this is a paragraph-ish element
        var isP = Readability.P_TAGS.test(myNode.nodeName);

        // Give the children a chance to gather stats
        for (var i = 0; i < myNode.childNodes.length; i++) {
            var cn = myNode.childNodes[i];

            // Gather stats for the child, and see if it disqualifies us (for
            // whatever reason)
            var childValid = Readability.getStats(cn, stats);

            // Serializing innerHTML is expensive, so only build the message
            // when it is actually going to be shown
            if (Readability.DEBUG && valid && isP && !childValid) {
                Readability.log(
                        'Disqualifying &lt;' + myNode.nodeName
                        + '&gt; because of child &lt;' + cn.nodeName
                        + '&gt;, value:<br>' + myNode.innerHTML.toPlainText().shorten(40)
                        );
            }

            valid = valid && childValid;

            // After an abort every remaining child would return false
            // immediately (and `valid` is already false), so stop iterating
            if (Readability._abort) {
                break;
            }
        }

        // If we're a still-valid P tag (or equivalent) process the node
        if (isP && valid) {
            Readability.processNode(myNode, stats);

            // And P tags are automatic disqualifiers
            valid = false;
        }
    }

    // Top-level call? return the stats
    if (!node) {
        // Fudge factor
        stats.syllables = Math.round(stats.syllables*136/128);

        return stats;
    }

    // Return true to indicate this is a valid branch to readability analyze
    return valid;
};

/**
 * Gather statistics for a known-to-be valid node.
 *
 * The node's markup is reduced to lower-case letters, periods and single
 * spaces. Paragraphs that are too short or mostly markup are counted in
 * stats.ignored and otherwise left alone; all others add to the character,
 * word, syllable, hard-word (3+ syllables), sentence and paragraph totals.
 *
 * @param {Element} node Element whose innerHTML is analyzed
 * @param {Object} stats Accumulator updated in place
 */
Readability.processNode = function(node, stats) {
    var text = node.innerHTML.toLowerCase();
    var fullLength = text.length;

    // Clean up the text
    text = text.toPlainText();
    var naturalLength = text.length;

    text = text.replace(/\s+/g, ' '); // Make all whitespace a space
    text = text.replace(/[!:\?]/g, '.'); // Terminators -> periods
    text = text.replace(/[^a-zA-Z\. ]/g, ''); // Drop everything but letters, periods and spaces
    text = text.replace(/\s+/g, ' '); // Collapse spaces down

    // Throw out really short paragraphs or paragraphs that are mostly markup
    var reason = '';
    if (text.length < 18) {
        reason = 'text is too short';
    } else if (naturalLength < fullLength*.75) {
        reason = 'text contains too much markup';
    }
    if (reason) {
        Readability.log('Ignoring &lt;' + node.nodeName + '&gt; because ' + reason);
        stats.ignored++;
        return;
    }

    if (Readability.DEBUG) {
        node.style.border = 'dashed 1px #0f0';
    }

    // A leading or trailing space would otherwise yield an empty "word" and
    // an empty "sentence" when the text is split below
    text = text.replace(/^ | $/g, '');

    // Get characters
    stats.characters += text.replace(/\W/g, '').length;

    // Get words
    var words = text.split(' ');
    stats.words += words.length;
    for (var j = 0; j < words.length; j++) {
        var sc = Readability.syllableCount(words[j]);
        stats.syllables += sc;
        if (sc >= 3) {
            stats.hardwords++;
        }
    }

    // Get sentences: a period next to a space ends one. Empty fragments
    // (e.g. from consecutive terminators) are not sentences.
    var fragments = text.split(/ \. | \.|\. /);
    var sentenceCount = 0;
    for (var k = 0; k < fragments.length; k++) {
        if (fragments[k]) {
            sentenceCount++;
        }
    }
    // A processed paragraph always counts as at least one sentence
    stats.sentences += Math.max(sentenceCount, 1);

    // Add up the counts
    stats.paragraphs++;

    // Only serialize the node again when the message will be shown
    if (Readability.DEBUG) {
        Readability.log(
            '&lt;' + node.nodeName
            + '&gt; processed, value:<br>' + node.innerHTML.toPlainText().shorten(40)
        );
    }
};

/**
 * Count the syllables in a word.
 * Note: This is an approximation that is unlikely to ever be accurate.  But
 * it'll be close.
 *
 * The word is reduced to its letters, each run of vowels (a, e, i, o, u, y)
 * and each run of consonants is collapsed to a single marker, and the number
 * of consonant-then-vowel transitions is taken as the syllable count. One is
 * subtracted for a few endings, and the result is never less than 1.
 *
 * @param {string} w The word to measure
 * @return {number} Estimated syllable count (>= 1)
 */
Readability.syllableCount = function(w) {
    // Strip every non-letter (not just the first) so that trailing
    // punctuation cannot hide the word ending checked below
    w = w.toLowerCase().replace(/[^a-z]/g, '');

    // Count consonant-vowel pairs
    var cv = w.replace(/[aeiouy]+/g, 'a').replace(/[^aeiouy]+/g, 'b');
    var pairs = cv.match(/ba/g);
    var c = pairs ? pairs.length : 1;

    // Subtract one for special endings
    if (c > 1 && /(e|es|ies|ying|ism|[^aeiou]ed)$/.test(w)) {
        c--;
    }

    return c;
};

// Style rules for the injected footer, one fragment per entry. The fragments
// are joined with newlines when the <style> element is created; the empty
// last entry makes the joined text end with a newline.
Readability.CSS = [
    '#readability {',
        'color: #444;',
        'background-color: #fff;',
        'clear:both;',
        'margin: 0em;',
        'padding: 2px;',
        'border-top: solid 1px #888;',
        'text-align: left;',
    '}',
    '#readability * {font: normal 8pt Arial;}',
    '#readability a, #readability a:hover, #readability a:visited {font-weight:bold; color:#060}',
    '#readability-banner {',
        'font-weight: bold;',
        'float:right;',
        'clear:right;',
        'margin-left:1em;',
    '}',
    '#readability-results {font-weight:bold;}',
    '#readability-log {display:none;margin:0em; padding:0em; list-style-position:inside; border-top: solid 1px #888;}',
    '#readability .readability-logentry {font: normal 8pt "Courier New";}',
    '#readability .readability-warning {color:#990;}',
    ''
];

// Markup placed inside the #readability container: the credit banner, one
// placeholder <div> per output section (filled in by render()), and the
// debug log list.
Readability.HTML = [
    '<div id="readability-banner">',
        'Readability Analysis by <a href="http://www.broofa.com/blog/index.php?p=67">Broofa.com</a>',
    '</div>',
    '<div id="readability-results"></div>',
    '<div id="readability-indexes"></div>',
    '<div id="readability-stats"></div>',
    '<div id="readability-time"></div>',
    '<ul id="readability-log"></ul>'
].join('');

/**
 * Inject the analyzer's stylesheet and its (initially empty) output container
 * into the page, and remember the log element for Readability.log().
 */
Readability.renderContainer = function() {
    // Find/create/insert the head element
    var head = document.getElementsByTagName('head')[0];
    if (!head) {
        head = document.createElement('head');
        // <head> belongs to the root element, directly before <body>
        // (insertBefore appends when the reference node is null)
        document.documentElement.insertBefore(head, document.body);
    }

    // Create/insert the style element
    var sel = document.createElement('style');
    sel.textContent = Readability.CSS.join('\n');
    head.appendChild(sel);

    // Create/insert the new content
    var nel = document.createElement('div');

    nel.id = 'readability';
    nel.innerHTML = Readability.HTML;
    document.body.appendChild(nel);

    Readability.logElement = document.getElementById('readability-log');
    if (Readability.DEBUG) {
        // The log is hidden by the stylesheet unless debugging
        Readability.logElement.style.display = 'block';
    }
};

/**
 * Remove the analyzer's output container from the page. Does nothing when
 * the container is missing or already detached.
 */
Readability.removeContainer = function() {
    var el = document.getElementById('readability');
    if (el && el.parentNode) {
        el.parentNode.removeChild(el);
    }
};

/**
 * Fill one section of the output container.
 *
 * @param {string} name Section name; the target element has the id
 *     'readability-' + name
 * @param {string} html Markup assigned to the section's innerHTML. Nothing
 *     happens when the section element does not exist.
 */
Readability.render = function(name, html) {
    var target = document.getElementById('readability-' + name);
    if (!target) {
        return;
    }
    target.innerHTML = html;
};

/**
 * Append an entry to the on-page debug log.
 *
 * @param {string} msg Markup for the entry (assigned to innerHTML). Ignored
 *     unless debugging is enabled and the log element has been created.
 */
Readability.log = function(msg) {
    if (!Readability.DEBUG || !Readability.logElement) {
        return;
    }

    var entry = document.createElement('li');
    entry.className = 'readability-logentry';
    entry.innerHTML = msg;
    Readability.logElement.appendChild(entry);
};

/**
 * Time since the script started running.
 *
 * @return {number} Seconds (fractional) elapsed since Readability._time
 */
Readability.elapsed = function() {
    var nowMs = new Date().getTime();
    var elapsedMs = nowMs - Readability._time;
    return elapsedMs / 1000;
};

/**
 * Main: inject the output container, gather page statistics, compute the
 * readability indexes and render the results.
 *
 * When fewer than 50 words were found the container is removed again instead
 * of showing results. If gathering statistics throws, the enumerable
 * properties of the exception are shown in place of the statistics.
 */
Readability.main = function() {
    // Throw down the content we're going to work with
    Readability.renderContainer();

    var stats;
    try {
        stats = Readability.getStats();
    } catch (e) {
        var html = [];
        for (var key in e) {
            html.push(key + ': ' + e[key]);
        }
        Readability.render('results', 'Exception while getting statistics');
        Readability.render('stats', html.join('<br>'));
        return;
    }

    if (stats.words < 50) {
        // Not enough text found to determine readability; remove our
        // container rather than display a message
        Readability.removeContainer();
    } else {
        var statshtml = [];
        var warning = !Readability._abort ? '' : '<div class="readability-warning">Processing incomplete because ' + Readability._abort + '</div>';
        // Listed before the indexes are added to `stats`, so only the raw
        // counts appear here
        for (var statName in stats) {
            statshtml.push(statName + ':' + stats[statName].sig(2));
        }
        Readability.render('stats', warning + 'Page stats: ' + statshtml.join(', '));

        //
        // Compute various readability indexes
        //

        var wordsPerSentence = stats.words/stats.sentences;
        var charsPerWord = stats.characters/stats.words;
        var syllablesPerWord = stats.syllables/stats.words;

        // Gunning-Fog index
        stats.gf = (wordsPerSentence + 100*stats.hardwords/stats.words)*.4;
        // SMOG index
        stats.smog = Math.sqrt(stats.hardwords*30/stats.sentences)+3;
        // Coleman-Liau index; the sentence term is 0.3 times the number of
        // sentences per 100 words
        stats.cl = 5.89*charsPerWord - .3*(100*stats.sentences/stats.words) - 15.8;
        // Automated Readability Index
        stats.ari = 4.71*charsPerWord + .5*wordsPerSentence - 21.43;
        // Flesch-Kincaid grade level
        stats.fk = .39*wordsPerSentence + 11.8*syllablesPerWord - 15.59;
        // Flesch reading ease (computed but not displayed or averaged)
        stats.fe = 206.835 - 1.015*wordsPerSentence - 84.6*syllablesPerWord;

        // Overall grade: mean of the five grade-level indexes
        var grade = Math.round((stats.gf + stats.ari + stats.fk + stats.cl + stats.smog)/5);

        // Upper grade bound of each difficulty band, in ascending order.
        // Bands without their own `grade` text are shown as "grade N".
        var bands = [
            {max: 3,  difficulty: 'trivial'},
            {max: 5,  difficulty: 'very easy'},
            {max: 6,  difficulty: 'easy'},
            {max: 7,  difficulty: 'fairly easy'},
            {max: 9,  difficulty: 'about normal'},
            {max: 12, difficulty: 'fairly difficult'},
            {max: 16, difficulty: 'difficult', grade: 'college undergraduate'}
        ];

        // Anything above the last band (or not a number) falls through to these
        var difficulty = 'very difficult';
        var gradeText = 'college post-graduate';
        for (var i = 0; i < bands.length; i++) {
            if (grade <= bands[i].max) {
                difficulty = bands[i].difficulty;
                gradeText = bands[i].grade || ('grade ' + grade);
                break;
            }
        }

        // Get the overall results
        Readability.render('results', 'Readability is ' + difficulty + ' (~ ' + gradeText + ')');

        // Now show all the individual indexes we used
        var systems = [];
        systems.push('Gunning-Fog:' + stats.gf.sig(1));
        systems.push('SMOG:' + stats.smog.sig(1));
        systems.push('Automated Readability:' + stats.ari.sig(1));
        systems.push('Coleman-Liau:' + stats.cl.sig(1));
        systems.push('Flesch-Kincaid(grade):' + stats.fk.sig(1));
        Readability.render('indexes', 'Indexes used: ' + systems.join(', '));
    }

    // No-op when the container was removed above
    Readability.render('time', '(Processed in ' + Readability.elapsed().sig(2) + ' secs)');
};

Readability.main();
