######################################################################
# regular expressions used by ZWiki

import re, string
import Defaults

# URLs/URIs (better regexps in urllib/urlparse ?)
# urlchars:   a run of one or more characters accepted inside a url
# urlendchar: a single alphanumeric or / ; appended to urlchars where the
#             remote part of an interwiki link is matched (see interwikilink)
# url:        an optional leading " or = , then the url itself as the outer
#             group, with the scheme alone as the inner group
urlchars         = r'[A-Za-z0-9/:@_%~#=&\.\-\?\+\$,]+'
urlendchar       = r'[A-Za-z0-9/]'
url              = r'["=]?((' + \
                   '|'.join(('about','gopher','http','https',
                             'ftp','mailto','file')) + \
                   r'):' + urlchars + r')'

# valid characters for zwiki page ids
# Safe page ids - for free-form names and wiki names alike - are formed
# from these characters.  They are the ones legal in both zope ids and
# urls, leaving out _ which we use for quoting. (See canonicalIdFrom).
# The compiled pattern matches exactly one such character.
# You have a choice here -
# 1. Don't allow international characters in ids.
zwikiidcharsexpr = re.compile(
    r'['
    r'a-zA-Z'   # ascii letters
    r'0-9'      # digits
    r'.-'       # a literal dot and hyphen (hyphen last, so it is no range)
    r']')
# 2. Allow (single-byte) international characters in page ids.
# You also need to hack zope's OFS.ObjectManager.bad_id, eg:
## bad_id = re.compile(r'[^\xC0-\xFFa-zA-Z0-9-_~,.$# ]').search
# what's the thread-safety issue noted there ?
# extract zopeidchars from bad_id - hacky:
#from OFS.ObjectManager import bad_id
#try:
#    zopeidchars = re.sub(r'\^',r'',bad_id.__self__.pattern)
#    zopeidchars = re.sub(r'\\\(',r'(',zopeidchars)
#    zopeidchars = re.sub(r'\\\)',r')',zopeidchars)
#except AttributeError:
#    # older zope uses ts_regex
#    zopeidchars = re.sub(r'\^',r'',bad_id.im_self.givenpat)
#    zopeidchars = re.sub(r'\\\,',r',',zopeidchars)
#    zopeidchars = re.sub(r'\\\.',r'.',zopeidchars)
#zwikiidcharsexpr = re.compile(re.sub(r'[_~,$()# ]',r'',zopeidchars))

# also used in generating page ids
# Matches a run of whitespace followed by a single lowercase letter; the
# letter is captured as group 1.
# XXX NB this is affected by locale - may not be what we want
# The letter set is read once, when this module is imported.  Where the
# locale-dependent string.lowercase constant does not exist we fall back
# to the plain ascii letters instead of failing the import with an
# AttributeError.
spaceandlowerexpr = re.compile(
    r'\s+([%s])' % (getattr(string, 'lowercase', None)
                    or string.ascii_lowercase))

# zwiki free-form links
# A free-form page name is written as [...] .  Almost anything may appear
# between the brackets, provided it stays on a single line.
# group 1 should be what's inside the brackets
bracketedexpr    = (r'\['            # opening bracket
                    r'([^\n\]]+)'    # one or more chars, no newline and no ]
                    r'\]')           # closing bracket

# zwiki wiki links
# Zwiki's WikiNames allow words of a single letter (APage, PageA) and
# trailing digits (PageVersion002).
#
# WikiNames can also contain (single-byte) international characters,
# usually depending on your locale.  The code below tries to enable them
# out of the box for most setups. You can set a specific (installed)
# locale here with ZWIKI_LOCALE, or leave it as None to use the default
# system locale.  If no particular locale appears to be set, we hard-code
# the characters and emulate the \b pattern. In this case word boundaries
# may not be recognized quite as well.

# Not hugely tested. Please report problems and successes at
# http://zwiki.org/ZwikiTracker and
# http://zwiki.org/InternationalCharactersInPageNames .
ZWIKI_LOCALE = None
import locale

# Work out whether a locale is in effect.  loc ends up as (None,None) when
# nothing could be determined, and as something else otherwise.
if ZWIKI_LOCALE:
    # setlocale returns the name of the new locale (a string), which never
    # equals (None,None), so the locale-aware branch below is taken
    loc = locale.setlocale(locale.LC_ALL, ZWIKI_LOCALE)
elif hasattr(locale,'getdefaultlocale'):
    try:
        loc = locale.getdefaultlocale()
    except ValueError:
        # getdefaultlocale raises ValueError for environment settings it
        # cannot parse; treat that as "no locale" and use the hard-coded
        # characters rather than failing to import this module
        loc = (None,None)
    # NOTE(review): this branch only inspects the default locale, nothing
    # here calls setlocale() for it - presumably the host process has
    # already done so; confirm.
else:
    loc = (None,None)

# wikiname1: one or more uppercase, one or more lowercase, an uppercase,
#            any further letters, optional trailing digits (PageVersion002)
# wikiname2: an uppercase, one or more further uppercase, a lowercase,
#            any further letters, optional trailing digits (APage)
if loc != (None,None):
    # use the letter sets the string module reports, falling back to the
    # ascii ones where the locale-dependent constants are not available
    U = getattr(string, 'uppercase', None) or string.ascii_uppercase
    L = getattr(string, 'lowercase', None) or string.ascii_lowercase
    wikiname1        = r'(?L)\b[%s]+[%s]+[%s][%s]*[0-9]*' % (U,L,U,U+L)
    wikiname2        = r'(?L)\b[%s][%s]+[%s][%s]*[0-9]*'  % (U,U,L,U+L)
else:
    # no locale: hard-code ascii plus the single-byte ranges \xc0-\xdf
    # (upper) and \xe0-\xff (lower), and emulate \b with a lookbehind that
    # refuses a preceding letter or digit
    U = 'A-Z\xc0-\xdf'
    L = 'a-z\xe0-\xff'
    b = '(?<![%s0-9])' % (U+L)
    wikiname1        = r'(?L)%s[%s]+[%s]+[%s][%s]*[0-9]*' % (b,U,L,U,U+L)
    wikiname2        = r'(?L)%s[%s][%s]+[%s][%s]*[0-9]*'  % (b,U,U,L,U+L)

# Composite link patterns, assembled from the pieces defined earlier in
# this module.  Each link pattern accepts an optional leading ! before the
# link itself (presumably the marker that suppresses linking - confirm in
# the rendering code).
#   simplewikilink  wiki name or url
#   wikilink        wiki name, bracketed free-form name or url
#   localwikilink   wiki name or bracketed free-form name
#   interwikilink   local:remote, exposed as the named groups "local" and
#                   "remote"; remote is urlchars followed by one urlendchar
#   anywikilinkexpr compiled; tries interwikilink first, then wikilink
simplewikilink   = r'!?(' + '|'.join((wikiname1, wikiname2, url)) + r')'
wikilink         = r'!?(' + \
                   '|'.join((wikiname1, wikiname2, bracketedexpr, url)) + r')'
localwikilink    = r'!?(' + \
                   '|'.join((wikiname1, wikiname2, bracketedexpr)) + r')'
interwikilink    = r'!?((?P<local>%s):(?P<remote>%s%s))' % \
                   (localwikilink, urlchars, urlendchar)
anywikilinkexpr  = re.compile(r'(' + interwikilink + r'|' + wikilink + r')')
# an html link with an empty title attribute; the last path segment of
# the href is captured as the named group "page"
untitledwikilinkexpr = re.compile(
    r'<a href="([^"/]*/)*(?P<page>[^/"]*)" title="">.*?</a>')
# a line ending in "RemoteWikiURL" + optional colons/whitespace + a url
# (group 1), case-insensitively
remotewikiurl    = r'(?mi)RemoteWikiURL[:\s]*([^\s]+)\s*$'
# a whole line beginning with ! ; group 1 is the rest of the line
protected_line   = r'(?m)^!(.*)$'

# stx footnotes
# we handle these ourselves, so that they can co-exist with our
# bracketed links
# real stx allows refchars = r'[0-9_%s-]' % (string.letters)
footnoteexpr     = (r'(?sm)'            # dotall + multiline flags
                    r'^\.\. '           # a line starting with ".. "
                    r'\[([^\n\]]+)\]')  # [label], label is group 1

# for stripping javascript
# Matches any <...> tag whose content has "script" after a (possibly
# empty) run of non-word characters, eg <script ...> and </script>.
# XXX needs work, eg should not match
# <input... name="ZPythonScriptHTML_editAction:method">
javascriptexpr   = (r'(?iL)'                    # ignore case, locale flag
                    r'<([^>\w]*script[^>]*)>')  # \1 will be displayed

# for stripping HTML header/footer
# Both patterns ignore case and let . span newlines.
# XXX strip these from the middle of pages too ?
htmlheaderexpr = (r'(?si)'
                  r'^(\s*<!doctype.*?)?'  # optional doctype at the very start
                  r'\s*<html.*?'          # the <html tag and what follows ...
                  r'<body.*?>')           # ... through the end of the <body> tag
htmlfooterexpr = (r'(?si)'
                  r'</body.*?>'           # closing body tag
                  r'\s*</html.*?>'        # closing html tag
                  r'\s*$')                # only whitespace up to the end

# sgml tags, including those containing dtml/python and multi-line
# XXX needs more work, does not match all tags
#
# one badass regexp
#
#r'(?s)<((".*?")|[^">]+)*>'          # takes exponential time
#r'(?s)<((".*?")|[^">]+(?![^">]))*>' # avoids backtracking (see perlre)
# to avoid matching casual angle bracket use, treat dtml separately,
# recognizing that stuff like <!-- dtml-var ...> & </dtml ...> is also dtml
# and that a simple sgml tag may contain a dtml tag.
# put the dtml pattern first: (|) alternation tries its branches from left
# to right and takes the first one that matches, not the longest
# simpletagchars: a character class holding the characters accepted in a
# simple sgml tag - letters, digits, whitespace and some punctuation.
try: # work with different zope versions
    # copied from doc_sgml()
    import StructuredText
    simpletagchars = \
      r'[%s0-9\.\=\'\"\:\/\-\#\+\s\*\;\!\&\-]' % StructuredText.STletters.letters
except (ImportError, AttributeError): # no StructuredText, or older zope
    # Fall back to plain ascii letters.  Both ranges are spelled out: the
    # shorthand A-z would also take in [ \ ] ^ _ and ` , which are not
    # letters and are not accepted by the branch above either.
    simpletagchars = r'[A-Za-z0-9\.\=\'\"\:\/\-\#\+\s\*\;\!\&\-]'
# a dtml tag: < , optional - / ! or space characters, "dtml", then any mix
# of double-quoted strings and unquoted runs, up to > ; the lookahead
# makes each unquoted run maximal, which avoids needless backtracking
dtmltag = r'(?si)<[-/! ]*dtml((".*?")|[^">]+(?![^">]))*>'
# a dtml entity: &dtml...;
dtmlentity = r'(?i)&dtml.*?;'
#simplesgmltag = r'<((".*?")|%s+)>' % simpletagchars
# a simple sgml tag: < , then exactly one of a double-quoted string, a
# dtml tag or a maximal run of simpletagchars, then >
simplesgmltag = r'<((".*?")|(%s)|%s+(?!%s))>' % (dtmltag,simpletagchars,simpletagchars)
# any of the above; the dtml tag alternative is tried first
dtmlorsgmlexpr = r'(%s|%s|%s)' % (dtmltag,simplesgmltag,dtmlentity)

# From_ separator used to recognize rfc2822 messages - regexp from mailbox.py
# used in Messages.py
# The field names below are a reading aid only, the pattern itself just
# checks the shape of each token.
fromlineexpr = (r'(?m)\n\nFrom '          # blank line, then "From "
                r'\s*[^\s]+'              # one token (eg the sender)
                r'\s+\w\w\w'              # three word chars (eg day name)
                r'\s+\w\w\w'              # three word chars (eg month)
                r'\s+\d?\d'               # one or two digits (eg day)
                r'\s+\d?\d:\d\d(:\d\d)?'  # h:mm or hh:mm, optional :ss
                r'(\s+[^\s]+)?'           # optional extra token
                r'\s+\d\d\d\d'            # four digits (eg year)
                r'\s*$')                  # optional whitespace to end of line

# NIDs embedded in page source
# used in PurpleNumbers.py
# Matches optional leading whitespace and a {nid XXX} marker.  Group 1 is
# the whole marker including its braces, the named group "nid" is the id.
# An id consists of digits and ascii letters only; both letter ranges are
# spelled out because the shorthand A-z would also accept [ \ ] ^ _ and `
nidexpr = r'\s*({nid (?P<nid>[0-9A-Za-z]+?)})'

