#
# WEBMIRROR
#
#
# Written by Renaud Deraison <deraison@nessus.org>
#
#
# This plugin mirrors the paths used by a website. We typically care
# to obtain the list of CGIs installed on the remote host, as well as
# the path they are installed under. 
#
# Note that this plugin does not properly check for the syntax of the
# HTML pages returned : it tries to extract as much info as it
# can. We don't care about the page extensions either (but we do
# care about the mime types)
#
# This plugin takes a really long time to complete, so it updates
# the KB as soon as data is found (as it's likely to be killed
# by nessusd against huge sites)
#
#
# Credit goes to webmirror.pl by t-0micr0n - mostly for the idea of 
# saving time and code complexity by not checking if the HTML syntax is 
# correctly respected
#
#
#
# This plugin only works properly with the libnasl 1.1.x and newer
#
# See the Nessus Scripts License for details
#

if(description)
{
 #
 # Registration pass: declare the plugin metadata and leave before
 # any of the mirroring code below is reached.
 #
 script_id(10662);
 script_version ("$Revision: 1.22 $");

 script_name(english:"Web mirroring", francais:"Web mirroring");

 script_description(english:"
This script makes a mirror of the remote web site(s)
and extracts the list of CGIs that are used by the remote
host.

It is suggested you give a high timeout value to
this plugin and that you change the number of
pages to mirror in the 'Options' section of
the client.

Risk factor : None");

 script_summary(english:"Performs a quick web mirror",
		francais:"Effectue un miroir rapide de site web");

 script_category(ACT_GATHER_INFO);

 script_copyright(english:"This script is Copyright (C) 2001 Renaud Deraison",
		francais:"Ce script est Copyright (C) 2001 Renaud Deraison");

 script_family(english:"CGI abuses", francais:"Abus de CGI");

 #
 # Needs the service detection and HTTP version plugins, and a web port.
 #
 script_dependencie("find_service.nes", "httpver.nasl");
 script_require_ports("Services/www", 80);

 #
 # User-tunable options; the names are looked up again verbatim
 # by script_get_preference() in the globals section.
 #
 script_add_preference(name:"Number of pages to mirror : ",
			type:"entry",
			value:"10");
 script_add_preference(name:"Start page : ",
			type:"entry",
			value:"/");
 exit(0);
}


	     
#------------------------------------------------------#
# Globals                                              #
#------------------------------------------------------#

#
# Target: host name, first page to fetch (defaults to "/") and web port
# (taken from the KB, 80 when the KB has no entry).
#
hostname = get_host_name();
start_page = script_get_preference("Start page : ");
if(!start_page)start_page = "/";

port = get_kb_item("Services/www");
if(!port) port = 80;

#
# Is there a service running ?
#
# The connection below is only a reachability probe. retr() opens its
# own socket for every page and overwrites <soc>, so the probe socket
# is closed right away instead of being left open for the whole run.
#
if(!get_port_state(port))exit(0);
soc = open_sock_tcp(port);
if(!soc)exit(0);
close(soc);


# Upper bound used by the crawl loop at the bottom of the file.
max_pages = script_get_preference("Number of pages to mirror : ");
#max_pages = 40;

# A double-quote character, used to build patterns and to strip quotes.
quote = raw_string(0x22);

# Set to 1 to get display() traces from the functions below.
debug = 0;

# Queue of URLs to visit; add_url() appends to it.
urls[0] = start_page;
num_urls = 1;

# CGIs found so far; add_cgi() appends to it.
cgis[0] = "";
num_cgis = 0;


# args[i] holds the argument list recorded for cgis[i].
args[0] = "";
num_args = 0;

# Pages that answered 200 (see add_200).
url200[0] = "";
num_200 = 0;

# Pages that answered 404 (see add_404).
url404[0] = "";
num_404 = 0;

# Arguments collected by parse() for the form currently being read.
cur_args = "";
cur_cgi = 0;


# State shared by the (recursive) parse() function: the CGI handle of
# the form being read and the current nesting depth.
___saveme = "";
level = 0;

# Per-extension page counters; add_200() increments pages[<extension>].
num_extensions = 1;
extensions[0] = "html";
pages["html"] = 0;


#------------------------------------------------------#
# Keep track of valid & invalid urls                   #
#------------------------------------------------------#
function add_200(url)
{
  #
  # Remember the page. The counter is bumped first, so the KB entries
  # written below are numbered from 1.
  #
  url200[num_200] = url;
  num_200 = num_200 + 1;

  #
  # Publish the valid URL in the KB so that other plugins can use it.
  #
  kb_page = string("www/", port, "/content/pages/", num_200);
  if(debug)display(kb_page, "=" ,url, "\n");
  set_kb_item(name:kb_page, value:url);

  #
  # Classify the page by extension: drop the query string, then only
  # go on when what is left contains a dot and does not end with "/".
  #
  bare_url = ereg_replace(string:url,
  			  pattern:"(.*)\?.*",
			  replace:"\1");
  has_ext = 0;
  if("." >< bare_url)
  {
   if(!ereg(pattern:"^.*/$", string:bare_url))has_ext = 1;
  }

  if(has_ext)
  {
   # Text after the last dot; discarded if it still contains a "/"
   ext = ereg_replace(pattern:".*\.([^\.]*)$",
   		      string:bare_url,
		      replace:"\1");
   if(!("/" >< ext))
   {
    pages[ext] = pages[ext] + 1;
    set_kb_item(name:string("www/", port, "/content/extensions/", ext, "/",
    		pages[ext]), value:url);
   }
  }
}

function add_404(url)
{
  #
  # Append <url> to the list of broken links. Nothing is written
  # to the KB for these.
  #
  num_404 = num_404 + 1;
  url404[num_404 - 1] = url;
}

#------------------------------------------------------#
# Add a CGI to our array of CGIs to visit              #
#------------------------------------------------------#
#
# Registers <cgi> in cgis[] and in the KB.
#
# Returns the 1-based handle of the CGI (so that callers can store its
# arguments in args[handle - 1]), whether it was just added or already
# known, and 0 when the CGI is skipped because it lives on another
# host or is not reached over http://.
#
function add_cgi(cgi)
{
  #
  # Replace http://blah/tag by /tag
  # Skip the url if it's not the same host
  #
  # Only a string that starts with a scheme is treated as absolute, so
  # that a relative path which merely contains "http" is not thrown
  # away.
  #
  if(ereg(pattern:"^[a-z]+://",
	  string:cgi,
	  icase:1))
  {
    hn = ereg_replace(pattern:"^http://([^/]*)(/.*)?$",
		     string:cgi,
		     icase:1,
		     replace:"\1");

    # The input comes back unchanged when the pattern did not match,
    # i.e. when the scheme is not http
    if(hn == cgi)return(0);
    if(!(hn == hostname))return(0);

    cgi = ereg_replace(pattern:"^http://[^/]*(/.*)$",
		     string:cgi,
		     icase:1,
		     replace:"\1");

    # "http://host" with no path at all designates the root
    if(ereg(pattern:"^http://", string:cgi, icase:1))cgi = "/";
  }


  #
  # Already known ? Hand back the handle of the existing entry
  #
  for(_i = 0 ; _i < num_cgis ; _i = _i + 1)
  {
    if(cgi == cgis[_i])
      {
      return(_i + 1);
      }
  }

  if(debug)display("add cgi ", cgi, "\n");
  cgis[num_cgis] = cgi;
  num_cgis = num_cgis + 1;

  #
  # Put the CGI in the KB, so that it may be re-used by another
  # plugin.
  #
  set_kb_item(name:string("www/", port, "/content/cgi/", num_cgis), value:cgi);
  return(num_cgis);
}

#------------------------------------------------------#
# Extracts a list of arguments from a 'cmd-line' arg   #
# (that is, /foo.php?id=a&name=b -> "id name")         #
#------------------------------------------------------#
#
# Turns a query string ("id=a&name=b") into the list of its argument
# names, each followed by its value between brackets when it has one:
# "id [a] name [b] ". An argument name is only listed once.
#
# Returns that string, or 0 when nothing could be extracted.
#
function parse_cmdline_args(cmd)
{
 _ret = "";
 while(cmd)
 {
  #
  # Drop the separator(s) left in front by the previous round.
  # Without this the next pair would be read as empty and every
  # argument after the first one would be lost.
  #
  while(ereg(pattern:"^&", string:cmd))
   cmd = cmd - "&";

  # First name=value pair (the whole string when there is no '&')
  _cmd = ereg_replace(string:cmd,
  		      pattern:"([^&]*)&.*",
		      replace:"\1");
  if(!_cmd)
  {
   cmd = 0;
  }
  else
  {
   _arg = ereg_replace(string:_cmd,
   			pattern:"([^=]*)=.*",
			replace:"\1");

   _val = ereg_replace(string:_cmd,
   			pattern:"([^=]*)=([^&]*).*",
			replace:"\2");
   # No '=' in the pair : the pattern did not match, there is no value
   if((_val == _cmd))_val = "";

   #
   # Compare whole names (delimited by spaces on both sides) so that
   # "id" is not mistaken for a duplicate of "userid"
   #
   if(!(string(" ", _arg, " ") >< string(" ", _ret)))
   {
   _ret = string(_ret, _arg, " ");
   if(_val)
    _ret = string(_ret, "[", _val, "] ");
   }
  # Consume the pair that was just handled
  cmd = cmd - _cmd;
  }
 }
 if(strlen(_ret) > 1)
 {
 return(_ret);
 }
 else return(0);
}


#------------------------------------------------------#
# Add an URL in our array of URLs to visit             #
#------------------------------------------------------#
function add_url(url)
{
  #
  # Absolute link : keep it only when it points to the scanned host,
  # and reduce it to its path (http://blah/tag becomes /tag).
  #
  if(ereg(pattern:"^http.*$", string:url, icase:1))
  {
    hn = ereg_replace(pattern:"^http://([^/]*)/.*$",
		      string:url,
		      icase:1,
		      replace:"\1");
    # Unchanged input : the link did not have the http://host/... shape
    if(hn == url)return(0);

    rest = ereg_replace(pattern:"^http://[^/]*(/.*)$",
			string:url,
			icase:1,
			replace:"\1");
    myname = get_host_name();

    if(!rest)return(0);
    if(!(myname == hn))return(0);
    url = rest;
  }

  #
  # A query string means the link designates a CGI : register the
  # script and remember the names of its arguments.
  #
  if(ereg(pattern:".*\?.*", string:url))
  {
    _cgi = ereg_replace(pattern:"([^?]*)\?.*",
			replace:"\1",
			string:url);
    _args = ereg_replace(pattern:"[^?]*\?(.*)",
			 replace:"\1",
			 string:url);
    _args = parse_cmdline_args(cmd:_args);
    if(debug)display("call to add_cgi()\n");
    xyz = add_cgi(cgi:_cgi);
    if(xyz) args[xyz - 1] = _args;
  }

  #
  # Queue the URL unless it is already in the list.
  #
  _i = 0;
  while(_i < num_urls)
  {
    if(url == urls[_i])return(0);
    _i = _i + 1;
  }

  if(debug)display("**** ADD ", url, "\n");
  urls[num_urls] = url;
  num_urls = num_urls + 1;
}



#------------------------------------------------------#
# Extract an URL from an href= tag                     #
#------------------------------------------------------#
#
# Pulls the target of an href= / src= attribute out of the tag <line>,
# resolves it against <base> (the URL of the page being parsed) and
# hands the result to add_url().
#
# Returns 0 when the tag holds no usable link or when the link uses a
# scheme other than http.
#
function extract_href(line, base)
{
  #
  # The value may be followed by other attributes or by spaces before
  # the closing '>' (<a href=x class=y>), as extract_form() already
  # allows for action=.
  #
  url = ereg_replace(pattern:".*< *(a|area|frame) *.* (href|src)=([^> ]*)[^>]*>.*",
       	 	     string:line,
		     icase:1,
		     replace:"\3");
  # Unchanged input : the pattern did not match
  if(url == line)return(0);


  while(quote >< url)
  {
    url = url - quote;
  }
  # Links with a scheme are only followed when the scheme is http
  if(ereg(pattern:"^[a-z]+://.*", string:url,
	  icase:1)){
    	if(!(ereg(pattern:"^http://.*", string:url, icase:1)))return(0);
  }
  #
  # Suppress the dots-dots from a URL
  #
  num_dots = 0;
  orig = url;
  while("../" >< url)
  {
    url = url - "../";
    num_dots = num_dots + 1;
  }

  while("./" >< url)
  {
    url = url - "./";
  }

  # Directory of the current page : everything up to the last '/'
  base = ereg_replace(pattern:"(.*/)[^/]*$", string:base, replace:"\1");
  if(num_dots)
  {
    # Climb one directory for each "../" that was removed above
    while(num_dots)
    {
      base = ereg_replace(pattern:"(.*/)[^/]*/$",string:base, replace:"\1");
      if(debug)display(base, "\n");
      num_dots = num_dots - 1;
    }
   url = base + url;
   if(debug)display("*** DOTS : ", orig, " -> ", url, "\n");
  }
  # Relative link : prefix it with the directory of the current page
  if(!(url[0] == "/"))
  {
    if(!(ereg(pattern:"[a-z]*://.*",
	    string:url,
            icase:1))) url = base + url;
  }

  # Drop the fragment, if any
  if(ereg(pattern:".*#.*", string:url))
  {
   url = ereg_replace(pattern:"(.*)#.*", 
       		      string:url,
		      replace:"\1");
  }
  add_url(url:url);
}


#------------------------------------------------------#
# Extract an URL from a FORM tag                       #
#------------------------------------------------------#
#
# Pulls the action= target out of the <form> tag <line>, resolves it
# against <base> and registers it through add_cgi().
#
# Returns 0 when no action= value could be extracted, otherwise
# whatever add_cgi() returned.
#
function extract_form(line, base)
{

  cgi = ereg_replace(pattern:"^.*< *form .*action=([^> ]*).*>.*$",
      		     string:line,
		     icase:1,
 		     replace:"\1");

  # The code treats an unchanged result as "the pattern did not match"
  if(cgi == line)return(0);
  # Strip every double-quote from the value
  while(quote >< cgi)
  {
    cgi = cgi - quote;
  }
  
  #
  # Suppress the dots-dots from a URL
  #
  num_dots = 0;
  # Kept only for the debug trace below
  orig = cgi;
  while("../" >< cgi)
  {
    cgi = cgi - "../";
    num_dots = num_dots + 1;
  }

  # Directory part of <base> : everything up to and including the last '/'
  base = ereg_replace(pattern:"(.*/)[^/]*$", string:base, replace:"\1");
  if(num_dots)
  {
    # Remove one trailing directory from <base> per "../" counted above
    while(num_dots)
    {
      base = ereg_replace(pattern:"(.*/)[^/]*/$",string:base, replace:"\1");
      if(debug)display(base, "\n");
      num_dots = num_dots - 1;
    }
   cgi = base + cgi;
   if(debug)display("*** DOTS : ", orig, " -> ", cgi, "\n");
  }
  # A target that neither starts with '/' nor contains "://" is taken
  # as relative and prefixed with the directory computed above
  if(!(cgi[0] == "/"))
  {
    if(!(ereg(pattern:"[a-z]*://.*",
	    string:cgi,
            icase:1))) cgi = base + cgi;
  }
  
  xyz = add_cgi(cgi:cgi);
  return(xyz);
}


#
# Scans <data> tag by tag. Links are handed to extract_href(), <form>
# tags to extract_form(), and the names (and default values) of the
# input / button / select / textarea tags are accumulated in the
# global <cur_args>. On a closing </form>, <cur_args> is stored in
# args[] for the CGI handle kept in the global <___saveme>.
#
# <curcgi>, when set, is saved in <___saveme>.
#
# NOTE(review): <page>, used as the base of relative links, is not an
# argument of this function; it presumably refers to the argument of
# retr(), which calls parse() - confirm before renaming either.
#
function parse(data, curcgi)
{
 level = level + 1;
 if(curcgi){
 	___saveme  = curcgi;
	}
 # First tag (<...>) found in <data>
 sub = ereg_replace(pattern:"[^<]*(<[^>]*>).*",
       		  	  replace:"\1",
			  string:data);
			  
			  
 # Unchanged result : the code takes it as "no tag in <data>"
 if(sub == data)
 {
   level = level - 1;
    return(0);
 }

 # <oldr> is the text that remains to be scanned
 oldr = data;		  
 while(sub)
 {
  # Only tags mentioning one of these keywords are looked at
  if(ereg(pattern:"href|frame|form|input|button|select|textarea",
	  string:sub,
	  icase:1))
  {
  # The href/frame test comes first, so it wins over the others
  if(ereg(pattern:".*(href|frame).*", string:sub, icase:1))
  {
   extract_href(line:sub, base:page);
  }
  else
  {       
  # "form" not preceded by '/' : an opening <form> tag
  if(ereg(pattern:".*[^/]form.*", string:sub, icase:1))
  {
  # Register the form target, then parse what follows the tag
  # recursively with the returned handle as <curcgi>
  _cgi = extract_form(line:sub, base:page);
  data = oldr - sub;
  parse(data:data, curcgi:_cgi);
  curcgi = _cgi;
  }
  else {
   if(ereg(pattern:".*(input|button).*", string:sub, icase:1))
   {
   #
   # Extract the input name
   #
   # First try a name between double-quotes ...
    pt = string("^.*< *(input|button) .*name=",          
                quote,
                "([^",
                quote,
                "]*)",
                quote,
               ".*>.*$");
	       
   _name = ereg_replace(pattern:pt, replace:"\2", string:sub, icase:1);
   # ... and fall back on an unquoted name when that left <sub> unchanged
   if((_name == sub))
   {	       
   _name = ereg_replace(pattern:"^.*< *(input|button) .*name=([^<> ]*).*>.*$",
  			replace:"\2",
			string:sub,
			icase:1);
   }

    _value = "";
   
    #
    # Extract the default value
    #
    # Same two steps as for the name : quoted form first, then unquoted
    pt = string("^.*< *(input|button) .*value=",          
                quote,
                "([^",
                quote,
                "]*)",
                quote,
               ".*>.*$");
    _value = ereg_replace(pattern:pt,
  			replace:"\2",
			string:sub,
			icase:1);	
    if((!strlen(_value)) || (_value == sub))
    {
    _value = ereg_replace(pattern:"^.*< *(input|button) .*value=([^<> ]*).*>.*$",
    			replace:"\2",
			string:sub,
			icase:1);		
     # Still nothing usable : the tag has no default value
     if((!strlen(_value)) || (_value == sub))_value = "";			
    }					
    # Only record the field when a name was actually extracted
    if(!(_name == sub))
    {
    while(quote >< _name)
    _name = _name - quote;
   
    # The value "0" between double-quotes is left as is here and
    # reduced to 0 just before being appended below
    if(!(_value == string(quote, "0", quote)))
    {
    while(quote >< _value)
    _value = _value - quote;
    }

    # Append "name " (plus "[value] " if any) unless "name " is
    # already present in <cur_args>
    if(!(string(_name, " ") >< cur_args)) 
    {
    cur_args = string(cur_args, _name, " ");
    if(strlen(_value))
     {
      if(_value == string(quote, "0", quote))_value = "0";
     cur_args = string(cur_args, "[", _value, "] ");
     }
    }
  }
 }
 else
 {
 # <select> and <textarea> : only the name is recorded
 if(ereg(pattern:".*[^/](select|textarea)[^e].*", string:sub, icase:1))
  {
  _name = ereg_replace(pattern:"^.*< *(select|textarea) .*name=([^<> ]*).*>.*$",
  			replace:"\2",
			string:sub,
			icase:1);
   while(quote >< _name)
   { 
    _name = _name - quote;
   }
   if(!(string(_name, " ") >< cur_args))
    cur_args = string(cur_args, _name, " ");
  }
 else
  { 
  # Closing </form> : store the collected arguments for the saved
  # CGI handle, then reset the per-form state
  if(ereg(pattern:".*\/form.*", string:sub, icase:1))
  {
   if(___saveme)
   { 
    if(strlen(cur_args) > 1)
     {
      args[___saveme - 1] = cur_args;
      }
     }
     ___saveme = "";
     cur_args = "";
     # Inside a recursive call : the form is over, go back to the caller
     if(level > 1)
      {
       level = level - 1;
       return(0);
       }
      }
     }    
    }
   }
  }
 }
  # Remove the tag that was just handled and fetch the next one;
  # an unchanged result ends the loop
  oldr = oldr - sub;
  sub = ereg_replace(pattern:"^[^<]*(<[^>]*>).*$",
       		  	  replace:"\1",
			  string:oldr,
			  icase:1);
  if(sub == oldr)sub = 0;		  
  }
}

#------------------------------------------------------#
# Retrieve the page <page>                             #
#------------------------------------------------------#
#
# Fetches <page> from the target and acts on the status line :
#   404 : the page is recorded as a broken link
#   301 : the target of the Location header is queued with add_url()
#   200 : the page is recorded and, when it is text/html or text/xml,
#         its body is handed line by line to parse()
#
# Returns 1 when the page was parsed or a redirection was queued,
# 0 otherwise (connection failure, 404, skipped content type, any
# other status, or a 301 without a Location header).
#
# NOTE(review): parse() uses <page> as the base of relative links
# without receiving it as an argument, so it presumably relies on this
# argument name - do not rename it without checking parse().
#
function retr(page)
{
   if(debug)display("**** RETR ", page, "\n");
   soc = open_sock_tcp(port);
   # Nothing to read and nothing to close if the connection failed
   if(!soc)return(0);

   str = http_get(item:page, port:port);
   send(socket:soc, data:str);
   code = recv_line(socket:soc, length:4096);
   r = code;

   #
   # Page not found
   #
   if("404" >< code)
   {
     add_404(url:page);
     if(debug)display("*** ", page, " NOT FOUND\n");
     close(soc);
     return(0);
   }

   #
   # Redirection (code 301)
   #
   if("301" >< code)
   {
     #
     # Walk the headers until Location is found. The walk stops at the
     # blank line that ends the headers or when the server stops
     # answering, so a reply without a Location header can no longer
     # keep the plugin spinning forever.
     #
     redirected = 0;
     while(strlen(r) > 2)
     {
       if(ereg(pattern:"^Location:", string:r, icase:1))
       {
         # Take the line ending off before extracting the target, so
         # that it does not end up in the queued URL
         location = r - string("\r\n");
         location = location - string("\n");
         location = ereg_replace(string:location,
       			         pattern:"^Location: *(.*)$",
			         icase:1,
			         replace:"\1");

         add_url(url:location);
         redirected = 1;
         r = "";
       }
       else r = recv_line(socket:soc, length:4096);
     }
     close(soc);
     return(redirected);
   }

   #
   # Page found (code 200)
   #
   if("200" >< code)
   {
     add_200(url:page);
     while(strlen(r) > 2)
     {
       #
       # We only treat text that appear as text/{html,xml}
       # and we skip the rest
       #
       r = recv_line(socket:soc, length:4096);
       if(ereg(pattern:"^Content-Type:.*$", string:r, icase:1)){
         if(!ereg(pattern:".*text/(xml|html).*", string:r, icase:1))
         {
         if(debug)display("*** SKIPPING ", page, "\n");
         close(soc);
         return(0);
         }
       }
     }

     #
     # Body : each line is parsed on its own, starting from nesting
     # level 0, until nothing more is received
     #
     while(r)
     {
     r = recv_line(socket:soc, length:4096, timeout:2);
     level = 0;
     parse(data:r, curcgi:"");
     }
     close(soc);
     return(1);
   }

   # Any other status : nothing to do with this page
   close(soc);
   return (0);
}






#
# By default, we don't retrieve more than max_pages files.
#
# urls[0] is the start page, so the loop fetches it itself : a separate
# retr() call before the loop would retrieve it twice and record it
# twice in the KB. The queue grows while pages are parsed; slots that
# were never filled are not requested.
#
for(i=0;i<max_pages;i = i + 1)
{
  if(i < num_urls) retr(page:urls[i]);
}

#
# Analyzing time...
#

#
# Dump of the good and broken links. Switched off by the constant
# condition below; it would also need <debug> to be set to print.
#
if(0)
{
 i = 0;
 while(i < num_200)
 {
  if(debug)display("LINK : ", url200[i], "\n");
  i = i + 1;
 }

 i = 0;
 while(i < num_404)
 {
  if(debug)display("BROKEN LINK : ", url404[i], "\n");
  i = i + 1;
 }
}


#-------------------------------------------------------------------#
# Just report the CGIs used at this time. We'll improve that        #
# in a newer version.                                               #
#-------------------------------------------------------------------#
#
# KB key shared by every "cgi - arguments" entry, and header of the list
# shown to the user.
#
name = string("www/",port,"/cgis");
my_cgis = string("\nSyntax: cginame (arguments [default value])\n");

# No CGI found : nothing to report
if(!num_cgis)exit(0);

#
# One line per CGI. Those with known arguments are also saved in the KB.
#
i = 0;
while(i < num_cgis)
{
 entry = cgis[i];
 if(args[i])
  {
   set_kb_item(name:name, value:string(cgis[i], " - ", args[i]));
   entry = string(entry, " ( ", args[i], ")");
  }
 my_cgis = string(my_cgis, "\n", entry);
 i = i + 1;
}


report = string("For your information, here is the list of CGIs\n",
"that are used by the remote host, as well as their arguments : \n",
my_cgis);

security_note(port:port, data:report);
