#!/usr/local/bin/python2.3 -O
# -*- coding: iso-8859-1 -*-
"""check HTML pages for broken links"""
# Copyright (C) 2000-2004  Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

# imports and checks
import sys
# Abort early unless the interpreter exposes sys.version_info and reports
# at least Python 2.3.0 final.
if not (hasattr(sys, 'version_info') and
        sys.version_info >= (2, 3, 0, 'final', 0)):
    raise SystemExit("This program requires Python 2.3 or later.")

import getopt, re, os, pprint, socket, linkcheck
# Default timeout in seconds applied to all sockets; it is also shown in the
# --timeout help text and replaced further down when --timeout is given.
default_timeout = 60
socket.setdefaulttimeout(default_timeout)
# Debugging helpers.  NOTE(review): the wildcard import presumably provides
# debug(), warn(), set_debuglevel() and the level constants BRING_IT_ON and
# HURT_ME_PLENTY used later in this script -- confirm against
# linkcheck.debug.
from linkcheck.debug import *
from linkcheck.log import LoggerKeys
from linkcheck import StringUtil, Config, i18n
# Default values:
# _profile is the file profiling data is written to (--profile) and read
# from (--viewprof); _username/_password are the credentials shown as
# defaults in the -u/-p help and used for the authentication entry when
# only one of -u/-p is given.
_profile = "linkchecker.prof"
_username = "anonymous"
_password = "guest@"

# main usage text
# All three texts below are passed through i18n._() for translation.
# Usage is what LCOptionParser.get_usage() returns; Notes and Examples are
# appended to the generated option help by LCOptionParser.print_help().
Usage = i18n._("""USAGE\tlinkchecker [options] file-or-url...
""")

# General remarks printed after the option list in the help output.
Notes = i18n._("""NOTES
o A ! before any regex negates it. So '!^mailto:' matches everything but
  a mailto link.
o LinkCheckers commandline parser treats "ftp." links like "ftp://ftp."
  and "www." links like "http://www.".
  You can also give local files as arguments.
o If you have your system configured to automatically establish a
  connection to the internet (e.g. with diald), it will connect when
  checking links not pointing to your local host.
  Use the -s and -i options to prevent this.
o Javascript links are currently ignored.
o If your platform does not support threading, LinkChecker uses -t0.
o You can supply multiple user/password pairs in a configuration file.
o To use proxies set $http_proxy, $https_proxy, $ftp_proxy, $gopher_proxy
  on Unix or Windows.
  On a Mac use the Internet Config.
o When checking 'news:' links the given NNTP host doesn't need to be the
  same as the host of the user browsing your pages!
""")

# Sample invocations printed at the end of the help output.  This is a raw
# string so the backslashes in the regex and in the Windows path stay
# literal.
Examples = i18n._(r"""EXAMPLES
Check the treasure.calvinsplayground.de domain, but don't recurse into
links outside of this domain:
  linkchecker -v --status -r5 -Fhtml -itreasure\.calvinsplayground\.de \
  -ocolored http://treasure.calvinsplayground.de/

Don't connect to mailto: hosts, only check their URL syntax. All other
links are checked as usual:
  linkchecker --intern='!^mailto:' --strict www.mysite.org

Checking a local HTML file on Unix:
  linkchecker ../bla.html

Checking a local HTML file on Windows:
  linkchecker c:\temp\test.html

You can skip the "http://" url part if the domain starts with "www.":
  linkchecker www.myhomepage.de

You can skip the "ftp://" url part if the domain starts with "ftp.":
  linkchecker -r0 ftp.linux.org
""")

def printVersion ():
    """Write the application info to stdout and leave with exit status 0."""
    print >>sys.stdout, linkcheck.Config.AppInfo
    raise SystemExit(0)

def printUsage (msg):
    """Report msg as an error on stderr, point the user at the -h option
    and leave with exit status 1."""
    err = sys.stderr
    err.write(i18n._("Error: %s\n") % msg)
    err.write(i18n._("Execute 'linkchecker -h' for help\n"))
    raise SystemExit(1)


def viewprof ():
    """Print previously collected profiling data and exit.

    Loads the profile dump named by the module-level _profile, strips
    directory names, sorts by cumulative time and prints the first 50
    entries, then exits with status 0.  If the dump does not exist, a
    hint is written to stderr and the program exits with status 1.
    """
    if not os.path.exists(_profile):
        # The translated messages carry no line terminator of their own.
        # Write it separately (keeping the translation keys unchanged) so
        # the two sentences do not run together on a single line.
        sys.stderr.write(i18n._("Could not find profiling file %s.")%_profile)
        sys.stderr.write("\n")
        sys.stderr.write(i18n._("Please run linkchecker with --profile to generate it."))
        sys.stderr.write("\n")
        sys.exit(1)
    # imported here because it is only needed on this rarely used path
    import pstats
    stats = pstats.Stats(_profile)
    stats.strip_dirs().sort_stats("cumulative").print_stats(50)
    sys.exit(0)

# Read command line arguments
from optparse import OptionParser, OptionGroup

class LCOptionParser (OptionParser):

    def error (self, msg):
        printUsage(msg)

    def get_usage (self):
        return Usage

    def print_help (self, file=None):
        s = "%s\n%s\n%s"%(self.format_help(), Notes, Examples)
        if os.name!='posix':
            StringUtil.paginate(s)
        else:
            print s
        sys.exit(0)

optparser = LCOptionParser()

################# general options ##################
# Options that steer the program as a whole; none of them has a default,
# so an option that is not given is left at None after parsing.
group = OptionGroup(optparser, i18n._("General options"))
group.add_option(
    "-f", "--config",
    dest="configfile", type="string",
    help=i18n._(
"""Use file as configuration file. As default LinkChecker first
searches /etc/linkcheckerrc and then ~/.linkcheckerrc
(under Windows <path-to-program>\\linkcheckerrc)."""))
group.add_option(
    "-I", "--interactive",
    dest="interactive", action="store_true",
    help=i18n._(
"""Ask for url if none are given on the commandline."""))
group.add_option(
    "-t", "--threads",
    dest="threads", type="int",
    help=i18n._(
"""Generate no more than num threads. Default number of threads is 5.
To disable threading specify a non-positive number."""))
group.add_option(
    "-V", "--version",
    dest="version", action="store_true",
    help=i18n._(
"""Print version and exit."""))
optparser.add_option_group(group)


################# output options ##################
# Options controlling what is logged and where.  None of them sets a
# default, so an option that was not given is None after parsing.
group = OptionGroup(optparser, i18n._("Output options"))
group.add_option("-v", "--verbose", action="store_true", dest="verbose",
                 help=i18n._(
"""Log all checked URLs (implies -w). Default is to log only invalid
URLs."""))
group.add_option("-w", "--warnings", action="store_true", dest="warnings",
                 help=i18n._("""Log warnings."""))
group.add_option("-W", "--warning-regex", type="string", dest="warningregex",
                 help=i18n._(
"""Define a regular expression which prints a warning if it matches
any content of the checked link.
This applies of course only to pages which are valid, so we can
get their content.
Use this to check for pages that contain some form of error
message, for example 'This page has moved' or 'Oracle
Application Server error'.
This option implies -w."""))
# type="int": the value is a byte count.  Without an explicit type optparse
# stores the raw string, which cannot be compared numerically with a size.
group.add_option("--warning-size-bytes", type="int", dest="warningsizebytes",
                 help=i18n._(
"""Print a warning if content size is available and exceeds the given
number of bytes. This option implies -w."""))
group.add_option("-q", "--quiet", action="store_true", dest="quiet",
                 help=i18n._(
"""Quiet operation. This is only useful with -F."""))
group.add_option("-o", "--output", type="string", dest="output",
                 help=i18n._(
"""Specify output as %s. Default output type is text.""")%LoggerKeys)
# -F may be repeated; every occurrence is appended to options.fileoutput
group.add_option("-F", "--file-output", type="string", action="append",
                 dest="fileoutput", help=i18n._(
"""type[/filename]
Output to a file linkchecker-out.<type>, $HOME/.linkchecker_blacklist for
'blacklist' output, or <filename> if specified.
The <filename> part of the 'none' output type will be ignored,
else if the file already exists, it will be overwritten.
You can specify this option more than once. Valid file output types
are %s.
Default is no file output. If console output is not specified with -o,
this option suppresses all console output by implying -o none.""")%LoggerKeys)
# no dest given: optparse derives "debug" from the long option name
group.add_option("-D", "--debug", action="count",
                 help=i18n._(
"""Print debugging information. Provide this option multiple times
for even more debugging information."""))
group.add_option("--status", action="store_true", dest="status",
                 help=i18n._(
"""Print check status every 5 seconds to stderr."""))
group.add_option("--profile", action="store_true", dest="profile",
                 help=i18n._(
"""Write profiling data into a file named %s in the
current working directory. See also --viewprof.""")%_profile)
group.add_option("--viewprof", action="store_true", dest="viewprof",
                 help=i18n._(
"""Print out previously generated profiling data. See also --profile."""))
optparser.add_option_group(group)


################# checking options ##################
# Options that influence which links are checked and how.
group = OptionGroup(optparser, i18n._("Checking options"))
group.add_option(
    "-r", "--recursion-level",
    dest="recursionlevel", type="int",
    help=i18n._(
"""Check recursively all links up to given depth. A negative depth
will enable inifinite recursion. Default depth is 1."""))
group.add_option(
    "-e", "--extern",
    dest="extern", type="string", action="append",
    help=i18n._(
"""Assume urls that match the given expression as external.
Only internal HTML links are checked recursively."""))
group.add_option(
    "-i", "--intern",
    dest="intern", type="string", action="append",
    help=i18n._(
""" regex, --intern=regex
Assume URLs that match the given expression as internal.
LinkChecker descends recursively only to internal URLs, not to
external."""))
group.add_option(
    "-d", "--denyallow",
    dest="denyallow", action="store_true",
    help=i18n._(
"""Swap checking order to external/internal. Default checking order
is internal/external."""))
group.add_option(
    "-s", "--strict",
    dest="strict", action="store_true",
    help=i18n._(
"""Check only syntax of external links, do not try to connect to them.
For local file urls, only local files are internal. For
http and ftp urls, all urls at the same domain name are internal."""))
group.add_option(
    "-C", "--cookies",
    dest="cookies", action="store_true",
    help=i18n._(
"""Accept and send HTTP cookies according to RFC 2109. Only cookies
which are sent back to the originating server are accepted.
Sent and accepted cookies are provided as additional logging
information."""))
group.add_option(
    "-a", "--anchors",
    dest="anchors", action="store_true",
    help=i18n._(
"""Check HTTP anchor references. This option applies to both internal
and external urls. Default is don't check anchors.
This option implies -w because anchor errors are always warnings."""))
# store_false: giving this option sets options.anchorcaching to False
group.add_option(
    "--no-anchor-caching",
    dest="anchorcaching", action="store_false",
    help=i18n._(
"""Treat url#anchora and url#anchorb as equal on caching. This
is the default browser behaviour, but it's not specified in
the URI specification. Use with care."""))
group.add_option(
    "-u", "--user",
    dest="username", type="string",
    help=i18n._(
"""Try given username for HTTP and FTP authorization.
Default is %r. See also -p.""")%_username)
group.add_option(
    "-p", "--password",
    dest="password", type="string",
    help=i18n._(
"""Try given password for HTTP and FTP authorization.
Default password is %r. See also -u.""")%_password)
group.add_option(
    "--timeout",
    dest="timeout", type="int",
    help=i18n._(
"""Set the timeout for TCP connection attempts in seconds. The default
timeout is %d seconds.""") % default_timeout)
group.add_option(
    "-P", "--pause",
    dest="pause", type="int",
    help=i18n._(
"""Pause <secs> seconds between each url check. This option implies -t0.
Default is no pause between requests."""))
group.add_option(
    "-N", "--nntp-server",
    dest="nntpserver", type="string",
    help=i18n._(
"""Specify an NNTP server for 'news:...' links. Default is the
environment variable NNTP_SERVER. If no host is given,
only the syntax of the link is checked."""))
optparser.add_option_group(group)


################# deprecated options ##################
# Still accepted on the command line, but the parsed value is not read
# anywhere in this script.
group = OptionGroup(optparser, i18n._("Deprecated options"))
group.add_option("-R", "--robots-txt", action="store_true")
optparser.add_option_group(group)

################# auto completion #####################
# Hand the fully populated parser to the completion helper before the
# command line is parsed.
from linkcheck import optcomplete
optcomplete.autocomplete(optparser)

# Hidden switch: looked up directly in sys.argv because it is not a
# registered option; runs util1.abbuzze() and exits with status 0.
if sys.argv.count("--wischiwaschi"):
    from linkcheck import util1
    util1.abbuzze()
    raise SystemExit(0)

options, args = optparser.parse_args()

# Raise the debug level first, so that everything below can already log.
if options.debug is not None:
    set_debuglevel(options.debug)
debug(BRING_IT_ON, "Python", sys.version, "on", sys.platform)

# Create the configuration object and let it read its files; a file given
# with -f is passed along, otherwise read() receives an empty list.
config = linkcheck.Config.Configuration()
if options.configfile:
    configfiles = [options.configfile]
else:
    configfiles = []
config.read(configfiles)
# apply commandline options and arguments
# No option defines a default, so optparse leaves an option at None unless
# it was given.  The "is not None" tests below therefore only override
# configuration values for options actually present on the command line.
constructauth = False
do_profile = False
if options.anchors is not None:
    config["anchors"] = options.anchors
    # anchor problems are reported as warnings, so enable warning output
    config["warnings"] = True
if options.extern:
    config["externlinks"].extend([linkcheck.getLinkPat(arg) for arg in options.extern])
if options.output:
    if linkcheck.log.Loggers.has_key(options.output):
        config['log'] = config.newLogger(options.output)
    else:
        printUsage(i18n._("Illegal argument %r for option %s") % \
                   (options.output, "'-o, --output'"))
if options.fileoutput:
    for ftype in options.fileoutput:
        # Build a fresh argument dict for every -F option: a filename given
        # for one output type must not leak into the following ones.
        ns = {'fileoutput': 1}
        if '/' in ftype:
            # "type/filename"; only the first slash separates the two parts
            ftype, filename = ftype.split('/', 1)
            # an empty filename ("type/") is treated like no filename at all
            if filename:
                ns['filename'] = filename
        if linkcheck.log.Loggers.has_key(ftype):
            config['fileoutput'].append(config.newLogger(ftype, ns))
        else:
            printUsage(i18n._("Illegal argument %r for option %s") % \
                       (ftype, "'-F, --file-output'"))
    # file output without an explicit -o silences the console
    if not options.output:
        config['log'] = config.newLogger('none')
if options.interactive is not None:
    config['interactive'] = options.interactive
if options.intern:
    config["internlinks"].extend([linkcheck.getLinkPat(arg) for arg in options.intern])
if options.denyallow is not None:
    config["denyallow"] = options.denyallow
if options.nntpserver:
    config["nntpserver"] = options.nntpserver
if options.anchorcaching is not None:
    config["anchorcaching"] = options.anchorcaching
if options.password is not None:
    _password = options.password
    constructauth = True
if options.pause is not None:
    if options.pause >= 0:
        config["wait"] = options.pause
    else:
        printUsage(i18n._("Illegal argument %d for option %s") % \
                   (options.pause, "'-P, --pause'"))
if options.profile is not None:
    do_profile = options.profile
if options.quiet is not None:
    config["quiet"] = options.quiet
if options.recursionlevel is not None:
    config["recursionlevel"] = options.recursionlevel
if options.strict is not None:
    config["strict"] = options.strict
if options.status is not None:
    config['status'] = options.status
if options.threads is not None:
    config.setThreads(options.threads)
if options.timeout is not None:
    if options.timeout > 0:
        socket.setdefaulttimeout(options.timeout)
    else:
        printUsage(i18n._("Illegal argument %r for option %s") % \
                   (options.timeout, "'--timeout'"))
if options.username is not None:
    _username = options.username
    constructauth = True
if options.version is not None:
    printVersion()
if options.verbose:
    # verbose logging implies warnings
    config["verbose"] = True
    config["warnings"] = True
if options.viewprof:
    viewprof()
if options.warnings is not None:
    config["warnings"] = options.warnings
if options.warningregex is not None:
    try:
        config["warningregex"] = re.compile(options.warningregex)
    except re.error:
        # report a malformed expression as a usage error (exits with
        # status 1) instead of dying with a traceback
        printUsage(i18n._("Illegal argument %r for option %s") % \
                   (options.warningregex, "'-W, --warning-regex'"))
    config["warnings"] = True
if options.warningsizebytes is not None:
    # The value may arrive as a string when the option is declared without
    # a type; normalise it to an integer byte count in either case.
    try:
        config["warnsizebytes"] = int(options.warningsizebytes)
    except ValueError:
        printUsage(i18n._("Illegal argument %r for option %s") % \
                   (options.warningsizebytes, "'--warning-size-bytes'"))
    # the help text documents this option as implying -w
    config["warnings"] = True
if options.cookies is not None:
    config['cookies'] = options.cookies
if constructauth:
    # Put the command line credentials first in the list, with a pattern
    # that matches every url.  If only one of -u/-p was given, the other
    # one keeps its module-level default.
    config["authentication"].insert(0, {'pattern': re.compile(".*"),
                                        'user': _username,
                                        'password': _password})

debug(HURT_ME_PLENTY, "configuration:", pprint.pformat(config.items()))

# Without any url argument either ask for some (interactive mode) or just
# warn; processing then continues with whatever is in args.
if not args:
    if not config['interactive']:
        warn(i18n._("no files or urls given"))
    else:
        urls = raw_input(i18n._("enter one or more urls, separated by white-space\n--> "))
        args = urls.split()

# Shortcut expansion: an argument without any colon that starts with
# "www." or "ftp." gets the matching scheme prepended.
from linkcheck import UrlData
for url in args:
    url = url.strip()
    if ":" in url:
        pass
    elif url.startswith("www."):
        url = "http://" + url
    elif url.startswith("ftp."):
        url = "ftp://" + url
    config.appendUrl(UrlData.GetUrlDataFrom(url, 0, config, cmdline=True))

############################# check the urls ################################
if not do_profile:
    # psyco is deliberately not used: at the moment (Oct 2003) it has bugs
    # causing infinite loops when threads are enabled, and it disables
    # the Ctrl-C break button of the Python interpreter.
    linkcheck.checkUrls(config)
else:
    # run the very same call under the profiler and store the data in the
    # file named by _profile
    import profile
    profile.run("linkcheck.checkUrls(config)", _profile)
#############################################################################

# In interactive mode wait for the user before the program ends.
if config['interactive']:
    raw_input(i18n._("Hit RETURN to finish"))