<?php
/*
 * $Horde: imp/lib/MIME/Viewer/html.php,v 1.4.2.11 2003/01/03 13:23:14 jan Exp $
 *
 * Copyright 1999-2003 Anil Madhavapeddy <anil@recoil.org>
 * Copyright 1999-2003 Jon Parise <jon@recoil.org>
 *
 * See the enclosed file COPYING for license information (GPL). If you
 * did not receive this file, see http://www.fsf.org/copyleft/gpl.html.
 */

/**
 * The IMP_MIME_Viewer_html class renders out text/html message parts,
 * filtering scripting constructs and rewriting links on a best-effort basis.
 *
 * @author  Anil Madhavapeddy <anil@recoil.org>
 * @author  Jon Parise <jon@horde.org>
 * @version $Revision: 1.4.2.11 $
 * @since   IMP 3.0
 * @package horde.mime.viewer
 */

class IMP_MIME_Viewer_html extends MIME_Viewer {

    /**
     * Render out the currently set contents in HTML format.
     * The $mime_part class variable has the information to render
     * out, encapsulated in a MIME_Part object.
     *
     * The markup is passed through a series of best-effort filters
     * (control-character entities, script: URLs, on<event> handlers,
     * dangerous tags), root-relative src/href values are re-anchored on
     * the host named by a <base> tag, cid: references are pointed at
     * view.php, and links are made to open in a new window.
     *
     * @param object &$mime  Only its 'index' property is read here; it is
     *                       placed in the URLs generated for cid: images.
     *
     * @return string  The filtered HTML.
     */
    function render(&$mime)
    {
        $data = $this->mime_part->getContents();

        /* These regular expressions attempt to make html safe for
         * viewing. THEY ARE NOT PERFECT. If you enable html viewing, you
         * are opening a security hole. With the current state of the web,
         * I believe that the best we can do is to make sure that people
         * _know_ html is a security hole, clean up what we can, and leave
         * it at that. */

        /* Deal with <base> tags in the HTML, since they will screw up our
           own relative paths. Only the scheme and host are kept, and the
           URL must sit inside the <base> tag itself: the host stops at
           the first slash, quote, whitespace or angle bracket so that
           neighbouring attributes or later links are never picked up. */
        $base = '';
        if (preg_match('|<base\s[^>]*?(https?://[^/\s"\'<>]+)|i', $data, $baseMatch)) {
            $base = $baseMatch[1] . '/';
        }

        /* Nuke non-printable characters (a play in three acts). */
        // Rule #1: If we have a semicolon, it's deterministically detectable
        // and fixable, without introducing collateral damage.
        $data = preg_replace('/&#x?0*([9A-D]|1[0-3]);/i', '&nbsp;', $data);

        // Rule #2: Hex numbers (usually having an x prefix) are also
        // deterministic, even if we don't have the semi.  Note that some
        // browsers will treat &#a or &#0a as a hex number even without the x
        // prefix; hence /x?/ which will cover those cases in this rule.
        $data = preg_replace('/&#x?0*[9A-D]([^0-9A-F]|$)/i', '&nbsp\\1', $data);

        // Rule #3: Decimal numbers without semi.  The problem is that
        // some browsers will interpret &#10a as "\na", some as "&#x10a" so we
        // have to clean the &#10 to be safe for the "\na" case at the expense
        // of mangling a valid entity in other cases.  (Solution for valid HTML
        // authors: always use the semicolon.)
        $data = preg_replace('/&#0*(9|1[0-3])([^0-9]|$)/i', '&nbsp\\2', $data);

        /* Get all attribute="javascript:foo()" tags. */
        /* This is essentially the regex /=("?)[^>]*script:/ but expanded */
        /* to catch camouflage with spaces and entities. */
        $preg = '/(&#0*61;?|&#x0*3D;?|=)\s*(&#0*34;?|&#x0*22;?|")?[^>]*\s*(&#0*115;?|&#x0*73;?|s)\s*(&#0*99;?|&#x0*63;?|c)\s*(&#0*114;?|&#x0*72;?|r)\s*(&#0*105;?|&#x0*69;?|i)\s*(&#0*112;?|&#x0*70;?|p)\s*(&#0*116;?|&#x0*74;?|t)\s*(&#0*58;?|&#x0*3A;?|:)/i';
        $data = preg_replace($preg, '=\2cleaned', $data);

        /* Get all on<foo>="bar()" tags. Whitespace between the attribute
           name and the equals sign is tolerated so that 'onclick =' is
           caught as well. */
        $data = preg_replace('/(\s+on\w+)\s*=/i', '\1Cleaned=', $data);

        /* Get all tags that might cause trouble - <script>, <embed>,
           etc. Meta refreshes and iframes, too. */
        $malicious = array('|<([^>]*)s\s*c\s*r\s*i\s*p\s*t|i',
               '|<([^>]*)embed|i',
               '|<([^>]*)meta|i',
               '|<([^>]*)j\sa\sv\sa|i',
               '|<([^>]*)object|i',
               '|<([^>]*)iframe|i',
               '|<(\s*)style|i');
        $data = preg_replace($malicious, '<cleaned_tag', $data);

        /* A few other matches. */
        /* The replacement here used to end in a reference to a third
           capture group that the pattern does not have; such a reference
           expands to nothing, so the whole tag body is dropped. That
           effective behaviour is kept and simply spelled out. */
        $data = preg_replace('|<([^>]*)&{.*}([^>]*)>|', '<&{;}>', $data);
        $data = preg_replace('|<([^>]*)mocha:([^>]*)>|i', '<cleaned\2>', $data);

        /* Attempt to fix paths that were relying on a <base> tag. Handles
           double-quoted, single-quoted and unquoted values; URLs starting
           with '//' are protocol-relative, not root-relative, and are
           left untouched. */
        if ($base !== '') {
            /* Backslashes and dollar signs are special in a preg_replace()
               replacement string, so neutralise them in the base URL. */
            $baseReplacement = strtr($base, array('\\' => '\\\\', '$' => '\\$'));
            $data = preg_replace('#(src|href)(\s*=\s*["\']?)/(?!/)#i',
                                 '\1\2' . $baseReplacement, $data);
        }

        /* Search for inlined images that we can display. */
        global $MimeID, $imp;
        if (isset($MimeID) && is_array($MimeID)) {
            foreach ($MimeID as $ref => $id) {
                $id = (string)$id;

                /* Strip one enclosing angle bracket from each end. */
                if (substr($id, 0, 1) === '<') {
                    $id = (string)substr($id, 1);
                }
                if (substr($id, -1) === '>') {
                    $id = (string)substr($id, 0, -1);
                }

                /* An empty id would turn every bare "cid:" in the message
                   into an attachment link, so skip it. */
                if (strlen($id) == 0) {
                    continue;
                }

                $url = Horde::url('view.php?actionID=' . VIEW_ATTACH .
                                  '&index=' . $mime->index .
                                  '&mailbox=' . urlencode($imp['mailbox']) .
                                  '&thismailbox=' . urlencode($imp['thismailbox']) .
                                  '&id=' . $ref);
                $data = str_replace("cid:$id", $url, $data);
            }
        }

        // Convert links to open in new windows, except links that point
        // at an "#xyz" anchor. The tag name must be exactly "a" (followed
        // by whitespace or '>'), so tags such as <address>, <abbr> or
        // <area> are not corrupted.
        $data = preg_replace('/<a(?=[\s>])(?![^>]*href=["\']?#)/i',
                             '<a target="_blank"', $data);

        return $data;
    }

    /**
     * Return text/html as the content-type
     *
     * @return string  'text/html' constant.
     */
    function getType()
    {
        return 'text/html';
    }

}
