#!/usr/bin/env python3
# Why no XML parsing?
# Because this is problematic with attacks and incomplete HTML.
# We just take the very basics that can be assessed with regular expressions.


import sys
import re
import getopt
import os
import syslog
from urllib import parse, request
import urllib

# Shared regex flags for all patterns: DOTALL lets '.' also match newlines,
# so patterns can span multiple lines of HTML.
reflags = re.DOTALL

# Remove <tag>...</tag> from filecontent.
def remove_tag(filecontent, tag):
    """Remove every <tag ...>...</tag> span from filecontent.

    Returns (number of removed spans, accumulated removed markup,
    filecontent without these spans).

    NOTE: nested occurrences of the same tag are not handled — the first
    closing tag after an opening tag always ends the span.
    """
    remcount = 0
    # (?=[\s/>]) keeps a short tag name from matching longer tags,
    # e.g. tag "g" must not also match "<goo>".
    tagstart = re.compile(r"<%s(?=[\s/>])[^>]*>" % tag, flags=re.DOTALL)
    # Bug fix: the original end pattern "</%s>[\s]\n*" demanded at least one
    # whitespace character right after the closing tag, so closing tags
    # followed directly by other markup (or end of input) never matched and
    # the span was kept.
    tagend = re.compile(r"</%s>[\s\n]*" % tag, flags=re.DOTALL)
    startmatch = tagstart.search(filecontent, 0)
    removedcontent = ""
    while startmatch is not None:
        endmatch = tagend.search(filecontent, startmatch.start())
        if endmatch is None:
            break
        removedcontent += filecontent[startmatch.start():endmatch.end()]
        remcount += 1
        filecontent = filecontent[:startmatch.start()] + filecontent[endmatch.end():]
        # Re-scan from the same position: the removed span is gone.
        startmatch = tagstart.search(filecontent, startmatch.start())

    return (remcount, removedcontent, filecontent)


# Remove <tag /> when it has a given attr regular expression from filecontent.
def remove_tag_attr(filecontent, tag, attr):
    """Remove every <tag ...> whose markup matches the attr regex.

    Returns (number of found tags, number of removed tags, accumulated
    removed markup, filecontent without the removed tags).
    """
    # (?=[\s/>]) keeps tag "link" from also matching e.g. "<linkfoo>".
    tagfull = re.compile(r"<%s(?=[\s/>])[^>]*>[\s\n]*" % tag, flags=re.DOTALL)
    attrfull = re.compile(attr, flags=re.DOTALL)
    tagsfound = 0
    tagsremoved = 0
    removedcontent = ""
    fullmatch = tagfull.search(filecontent, 0)
    while fullmatch is not None:
        tagsfound += 1
        if attrfull.search(fullmatch[0]) is None:
            # Attribute does not match: keep this tag, continue behind it.
            fullmatch = tagfull.search(filecontent, fullmatch.end())
            continue
        removedcontent += fullmatch[0]
        filecontent = filecontent[:fullmatch.start()] + filecontent[fullmatch.end():]
        tagsremoved += 1
        # Re-scan from the same position: the removed tag is gone.
        fullmatch = tagfull.search(filecontent, fullmatch.start())

    return (tagsfound, tagsremoved, removedcontent, filecontent)


# Strip vector graphics: <svg> elements and their <g> groups.
def rem_svg(filecontent):
    """Drop all <svg> and <g> spans; return (count, removed markup, rest)."""
    count_svg, removed_svg, filecontent = remove_tag(filecontent, "svg")
    count_g, removed_g, filecontent = remove_tag(filecontent, "g")
    return (count_svg + count_g, removed_svg + removed_g, filecontent)


# Strip Javascript.
def rem_script(filecontent):
    """Drop all <script> spans; return (count, removed markup, rest)."""
    count, removed, remaining = remove_tag(filecontent, "script")
    return (count, removed, remaining)


# Strip <noscript> fallbacks.
def rem_noscript(filecontent):
    """Drop all <noscript> spans; return (count, removed markup, rest)."""
    count, removed, remaining = remove_tag(filecontent, "noscript")
    return (count, removed, remaining)


# Strip inline CSS.
def rem_style(filecontent):
    """Drop all <style> spans; return (count, removed markup, rest)."""
    count, removed, remaining = remove_tag(filecontent, "style")
    return (count, removed, remaining)


# Inject a <link> to our own stylesheet right after the opening <head> tag.
def add_owncss(filecontent, owncss):
    """Insert a stylesheet link for '<owncss>.css' after <head>, if present.

    NOTE(review): owncss is spliced into the attribute unescaped — if it can
    come from untrusted input this is an injection point; confirm callers
    restrict it.
    """
    headstart = re.compile("<head[ ]*[^>]*>", flags=reflags)
    match = headstart.search(filecontent, 0)
    if match is None:
        return filecontent
    insert_at = match.end()
    linktag = "<link rel='stylesheet' href='" + owncss + ".css' />"
    return filecontent[:insert_at] + linktag + filecontent[insert_at:]


# Clean up the <head>: drop <link> and <meta> tags that only serve
# preloading, tracking, social-media or vendor-app integration.
def rem_head(filecontent):
    """Strip unwanted <link>/<meta> tags from the document's <head>.

    Returns (number of removed tags, accumulated removed markup,
    filecontent with the cleaned head). When no complete <head> is found
    the document is returned unchanged.
    """
    headstartre = re.compile(r"<head[ ]*[^>]*>", flags=re.DOTALL)
    headendre = re.compile(r"</head>", flags=re.DOTALL)
    startmatch = headstartre.search(filecontent, 0)
    if startmatch is None:
        # Bug fix: the early exits used to return "" for the count while
        # the success path returns an int; keep the tuple types consistent.
        return (0, "", filecontent)
    endmatch = headendre.search(filecontent, startmatch.start())
    if endmatch is None:
        return (0, "", filecontent)
    headstart = startmatch.end()
    headend = endmatch.start()
    head = filecontent[headstart:headend]

    # Link relations we do not want to keep.
    # https://developer.mozilla.org/en-US/docs/Web/HTML/Link_types
    linkremovals = [
        "rel=['\"]*apple",
        "rel=['\"]*msapplication",
        "rel=['\"]*dns-prefetch",
        "rel=['\"]*prefetch",
        "rel=['\"]*prerender",
        "rel=['\"]*preload",
        "rel=['\"]*preconnect",
        "rel=['\"]*canonical",
        "rel=['\"]*external",
        "rel=['\"]*manifest",
        "rel=['\"]*modulepreload",
        "rel=['\"]*stylesheet",
    ]

    alltagsfound = 0
    alltagsremoved = 0
    allremovedcontent = ""
    for removal in linkremovals:
        tagsfound, tagsremoved, removedcontent, head = remove_tag_attr(head, "link", removal)
        alltagsfound += tagsfound
        alltagsremoved += tagsremoved
        allremovedcontent += removedcontent

    # Meta tags we do not want to keep (social media, robots, vendor apps).
    metaremovals = [
        "name=['\"]*twitter:",
        "property=['\"]*og:",
        "content=['\"]*og:",
        "property=['\"]*fb:",
        "name=['\"]*robots",
        "name=['\"]*msapplication",
        "name=['\"]*apple",
        "name=['\"]*i18n",
    ]
    for removal in metaremovals:
        tagsfound, tagsremoved, removedcontent, head = remove_tag_attr(head, "meta", removal)
        alltagsfound += tagsfound
        alltagsremoved += tagsremoved
        allremovedcontent += removedcontent

    # Splice the cleaned head back between the original <head> ... </head>.
    filecontent = filecontent[:headstart] + head + filecontent[headend:]
    return (alltagsremoved, allremovedcontent, filecontent)



# Condense <picture> tags: keep only the contained <img>, or drop the whole
# <picture> when there is no <img> fallback inside.
def rem_picture(filecontent):
    """Replace every <picture>...</picture> with its inner <img> tag.

    Returns (number of pictures found, accumulated replaced <picture>
    markup, resulting filecontent).
    """
    picturestart = re.compile(r"<picture[ ]*[^>]*>", flags=re.DOTALL)
    pictureend = re.compile(r"</picture>[\s\n]*", flags=re.DOTALL)
    # Bug fix: the original character class [^>*] also excluded '*', so an
    # <img> with a '*' anywhere in its attributes was never recognized.
    imgfull = re.compile(r"<img[^>]*>[\s\n]*", flags=re.DOTALL)
    picturesfound = 0
    picturescontent = ""
    startmatch = picturestart.search(filecontent, 0)
    while startmatch is not None:
        endmatch = pictureend.search(filecontent, startmatch.start())
        if endmatch is None:
            break
        picturesfound += 1
        block = filecontent[startmatch.start():endmatch.end()]
        imgmatch = imgfull.search(block)
        if imgmatch is None:
            # No <img> fallback inside: drop the whole <picture>.
            # Bug fix: the original 'break'ed here and left any later
            # <picture> elements unprocessed.
            filecontent = filecontent[:startmatch.start()] + filecontent[endmatch.end():]
        else:
            picturescontent += block
            filecontent = filecontent[:startmatch.start()] + imgmatch[0] + filecontent[endmatch.end():]
        startmatch = picturestart.search(filecontent, startmatch.start())

    return (picturesfound, picturescontent, filecontent)



# Strip <div> and <span> wrappers; their enclosed content is kept.
def rem_div_span(filecontent):
    """Delete every <div>/<span> opening and closing tag, keeping the
    content in between. Returns (-1, "", filecontent) like the other
    pattern-only stages (no count is tracked)."""
    # Raw strings: "\s" in a plain string literal is an invalid escape
    # sequence (SyntaxWarning since Python 3.12).
    filecontent = re.sub(r"<span[ ]*[^>]*?/*>[\s\n]*", "", filecontent, flags=re.DOTALL)
    filecontent = re.sub(r"</span>[\s\n]*", "", filecontent, flags=re.DOTALL)
    filecontent = re.sub(r"<div[ ]*[^>]*?/*>[\s\n]*", "", filecontent, flags=re.DOTALL)
    filecontent = re.sub(r"</div>[\s\n]*", "", filecontent, flags=re.DOTALL)
    return (-1, "", filecontent)



# Remove classes, IDs, names.
# NOTE: This is fragile — the substitutions match anywhere in the document,
# although they should only search within tags.
def rem_classes_ids_names(filecontent):
    """Delete class=, id=, name= and data-* attributes wherever they occur.

    Each attribute form is handled three times: single-quoted,
    double-quoted and unquoted. Returns (-1, "", filecontent) like the
    other pattern-only stages (no count is tracked).
    """
    # Raw strings: "\s" in a plain string literal is an invalid escape
    # sequence (SyntaxWarning since Python 3.12).
    filecontent = re.sub(r"class='[^']*'[\s\n]*", "", filecontent, flags=re.DOTALL)
    filecontent = re.sub(r"class=\"[^\"]*\"[\s\n]*", "", filecontent, flags=re.DOTALL)
    filecontent = re.sub(r"class=[^ ]*[\s\n]*", "", filecontent, flags=re.DOTALL)
    filecontent = re.sub(r"id='[^']*'[\s\n]*", "", filecontent, flags=re.DOTALL)
    filecontent = re.sub(r"id=\"[^\"]*\"[\s\n]*", "", filecontent, flags=re.DOTALL)
    filecontent = re.sub(r"id=[^ ]*[\s\n]*", "", filecontent, flags=re.DOTALL)
    filecontent = re.sub(r"name='[^']*'[\s\n]*", "", filecontent, flags=re.DOTALL)
    filecontent = re.sub(r"name=\"[^\"]*\"[\s\n]*", "", filecontent, flags=re.DOTALL)
    filecontent = re.sub(r"name=[^ ]*[\s\n]*", "", filecontent, flags=re.DOTALL)
    filecontent = re.sub(r"[\s\n]+data-[^=]*='[^']*?'[\s\n]*", " ", filecontent, flags=re.DOTALL)
    filecontent = re.sub(r"[\s\n]+data-[^=]*=\"[^\"]*?\"[\s\n]*", " ", filecontent, flags=re.DOTALL)
    filecontent = re.sub(r"[\s\n]+data-[^=]*=[^ ]*?[\s\n]*", " ", filecontent, flags=re.DOTALL)
    return (-1, "", filecontent)



# Remove empty lines and long runs of spaces after tags.
def rem_space(filecontent):
    """Collapse blank lines, shrink multiple spaces after '>' to one, and
    drop a space directly before '>'. Returns (-1, "", filecontent)."""
    # Raw string: "\s" in a plain string literal is an invalid escape
    # sequence (SyntaxWarning since Python 3.12).
    filecontent = re.sub(r"\n\s*\n", "", filecontent, flags=re.DOTALL)
    filecontent = re.sub("> {2,}", "> ", filecontent)
    filecontent = re.sub(" >", ">", filecontent)
    return (-1, "", filecontent)



# Leave only visible tags.
def rem_novisible(filecontent):
    """Rebuild <body> from only the recognized visible elements (headings,
    paragraphs, links, lists, inline markup, <br>), in document order,
    separated by blank lines.

    Returns (0, "", filecontent); when no <body> is found the document is
    returned unchanged.
    """
    visibletags = [
        re.compile(r"<h1[^>]*>.*?</h1>", flags=re.DOTALL),
        re.compile(r"<h2[^>]*>.*?</h2>", flags=re.DOTALL),
        re.compile(r"<h3[^>]*>.*?</h3>", flags=re.DOTALL),
        re.compile(r"<h4[^>]*>.*?</h4>", flags=re.DOTALL),
        re.compile(r"<h5[^>]*>.*?</h5>", flags=re.DOTALL),
        re.compile(r"<h6[^>]*>.*?</h6>", flags=re.DOTALL),
        re.compile(r"<p\s*>.*?</p>", flags=re.DOTALL),
        re.compile(r"<p\s[^>]*>.*?</p>", flags=re.DOTALL),
        re.compile(r"<a\s[^>]*>.*?</a>", flags=re.DOTALL),
        re.compile(r"<ul\s[^>]*>.*?</ul>", flags=re.DOTALL),
        re.compile(r"<ol\s[^>]*>.*?</ol>", flags=re.DOTALL),
        re.compile(r"<li\s[^>]*>.*?</li>", flags=re.DOTALL),
        re.compile(r"<b\s[^>]*>.*?</b>", flags=re.DOTALL),
        re.compile(r"<i\s[^>]*>.*?</i>", flags=re.DOTALL),
        re.compile(r"<q\s[^>]*>.*?</q>", flags=re.DOTALL),
        re.compile(r"<u\s[^>]*>.*?</u>", flags=re.DOTALL),
        re.compile(r"<br[^>]*[/]>", flags=re.DOTALL),
        re.compile(r"<center\s[^>]*>.*?</center>", flags=re.DOTALL),
        re.compile(r"<cite\s[^>]*>.*?</cite>", flags=re.DOTALL),
        re.compile(r"<strong\s[^>]*>.*?</strong>", flags=re.DOTALL),
        re.compile(r"<small\s[^>]*>.*?</small>", flags=re.DOTALL),
        re.compile(r"<tiny\s[^>]*>.*?</tiny>", flags=re.DOTALL),
        re.compile(r"<sup\s[^>]*>.*?</sup>", flags=re.DOTALL),
        re.compile(r"<sub\s[^>]*>.*?</sub>", flags=re.DOTALL),
    ]

    bodyobj = re.search(r"<body[ ]*[^>]*>.*</body>", filecontent, flags=re.DOTALL)
    # Bug fix: a document without a <body> used to crash with a TypeError
    # when the None match object was subscripted.
    if bodyobj is None:
        return (0, "", filecontent)
    body = bodyobj[0]
    curpos = 0
    newbody = ""
    while True:
        # Find the earliest match of any visible pattern at/after curpos;
        # ties go to the pattern listed first, as before.
        nextmatch = None
        for pattern in visibletags:
            m = pattern.search(body, curpos)
            if m is not None and (nextmatch is None or m.start() < nextmatch.start()):
                nextmatch = m
        if nextmatch is None:
            break
        newbody += nextmatch[0] + "\n\n"
        curpos = nextmatch.end()

    filecontent = filecontent[:bodyobj.start()] + "<body>\n" + newbody + "</body>\n" + filecontent[bodyobj.end():]

    return (0, "", filecontent)


# XXX: Stub — button removal is not implemented yet.
def rem_button(filecontent):
    """Remove buttons — not implemented yet; returns the content untouched."""
    return ("", "", filecontent)



# Remove the grouping <footer> tag. They are normally full of shit.
# The <footer> HTML element represents a footer for its nearest ancestor
# sectioning content or sectioning root element. A <footer> typically
# contains information about the author of the section, copyright data or
# links to related documents.
def rem_footer(filecontent):
    """Drop all <footer> spans; return (count, removed markup, rest)."""
    count, removed, remaining = remove_tag(filecontent, "footer")
    return (count, removed, remaining)


# XXX: AMP! link rel="amphtml"
# XXX: look at link rel="XXX" and possibly make use of it

# Split the file by its <header>, <nav>, etc. tags.
# Comments from MDN.
def rem_grouping(filecontent):
    """Strip the semantic grouping elements (<aside>, <header>, <article>,
    <nav>, <address>, <main>, <section>, <hgroup>) from filecontent.

    Work in progress: the removed parts are collected in locals but not yet
    written anywhere; always returns ("", "", filecontent).
    """
# XXX: More complicated than this. The tags occur several times, can also be nested, and have very different meanings
# Possibly handle each occurrence separately

    grouping = {}
    # <aside>
    # The <aside> HTML element represents a portion of a document whose content is only indirectly related to the document's main content. Asides are frequently presented as sidebars or call-out boxes.
    grouping["asidecnt"], grouping["aside"], filecontent = remove_tag(filecontent, "aside")
    # <header>
    # The <header> HTML element represents introductory content, typically a group of introductory or navigational aids. It may contain some heading elements but also a logo, a search form, an author name, and other elements.
    grouping["headercnt"], grouping["header"], filecontent = remove_tag(filecontent, "header")
    # <article>
    # The <article> HTML element represents a self-contained composition in a document, page, application, or site, which is intended to be independently distributable or reusable (e.g., in syndication). Examples include: a forum post, a magazine or newspaper article, or a blog entry, a product card, a user-submitted comment, an interactive widget or gadget, or any other independent item of content.
    # XXX: Attention! Can be nested!
    articlecnt, article, filecontent = remove_tag(filecontent, "article")
    # <nav>
    # The <nav> HTML element represents a section of a page whose purpose is to provide navigation links, either within the current document or to other documents. Common examples of navigation sections are menus, tables of contents, and indexes.
    navcnt, nav, filecontent = remove_tag(filecontent, "nav")
    # <address>
    # The <address> HTML element indicates that the enclosed HTML provides contact information for a person or people, or for an organization.
    addresscnt, address, filecontent = remove_tag(filecontent, "address")
    # <main>
    # The <main> HTML element represents the dominant content of the <body> of a document. The main content area consists of content that is directly related to or expands upon the central topic of a document, or the central functionality of an application.
    maincnt, main, filecontent = remove_tag(filecontent, "main")
    # <section>
    # The <section> HTML element represents a generic standalone section of a document, which doesn't have a more specific semantic element to represent it. Sections should always have a heading, with very few exceptions.
    sectioncnt, section, filecontent = remove_tag(filecontent, "section")

    # <hgroup>
    # The <hgroup> HTML element represents a heading and related content. It groups a single <h1>–<h6> element with one or more <p>.
    hgroupcnt, hgroup, filecontent = remove_tag(filecontent, "hgroup")

#    # XXX: Do this: write the split-out parts to their own files.
#    with open(MAINFILE, "w") as fp:
#        fp.write(filecontent)
#    with open(NAVFILE, "w") as fp:
#        fp.write(navfile)
#    with open(SIDEBARS, "w") as fp:
#        fp.write(sidefile)
#    return(nav, additional, filecontent)
    return("", "", filecontent)

# List holding the options. It is a lot of manual labor otherwise and nobody will see what is happening anymore.
# Maps long option name -> (short flag, help text, handler function).
# Each handler takes the document string and returns a 3-tuple of
# (count, removed markup, remaining document).
options = {
        "split": ("S", "Split the document by its <header>, <nav>, <main>, etc. parts.", rem_grouping),
        "vector": ("V", "Remove <svg> tags.", rem_svg),
        "script": ("s", "Remove <script> (Javascript) tags.", rem_script),
        "header": ("H", "Clean up the header, remove links etc.", rem_head),
        "picture": ("p", "Condense <picture> tags to the original <img> tag.", rem_picture),
        "classes": ("c", "Remove classes, IDs and names from all elements.", rem_classes_ids_names),
        "div": ("d", "Remove <div> and <span> tags.", rem_div_span),
        "whitespace": ("w", "Clean up non-visible whitespaces.", rem_space),
        "novisible": ("n", "Remove everything but <p>, <hX> and <a> tags.", rem_novisible),
        "style": ("C", "Remove <style> tags.", rem_style),
        "footer": ("f", "Remove the <footer>.", rem_footer),
        "button": ("b", "Remove buttons.", rem_button)
    }


def help(html = False):
    """Print the usage/option summary, as plain text or as an HTML page."""
    if not html:
        print("debloat.py [options] <infile> [outfile]")
        print("Remove stuff from given HTML infile. When outfile is not given, print to stdout.")
        print("Options:")
        for option in options.keys():
            # Bug fix: short flag and long name were swapped, printing
            # e.g. " -script --s" instead of " -s --script".
            print(" -%s --%s={yes,no}" % (options[option][0], option))
            print("    %s" % options[option][1])
        print(" -o --owncss={dark,light}")
        print("    Insert own CSS.")
        print(" -v --verbose")
        print("    Be verbose about the operation. Breaks HTML.")
        print(" -h --help")
        print("    Print this help.")
    else:
        htmlheader("debloatproxy help page")
        print("Required arguments:")
        print("<ul>")
        print("  <li>site - website to proxy</li>")
        print("</ul>")
        print("Arguments:")
        print("<ul>")
        print("  <li>owncss - CSS template to use. Set to 'light' or 'dark'.</li>")
        print("</ul>")
        print("Arguments set to 1 or 0:")
        # Bug fix: this list was closed with </ul> but never opened, and
        # the <li> items were never closed.
        print("<ul>")
        for option in options.keys():
            print("<li>%s - %s</li>" % (option, options[option][1]))
        print("</ul>")
        htmlfooter()

def httpheader():
    """Emit only the CGI Content-type header (no HTML boilerplate)."""
    print("Content-type:text/html", end="\r\n\r\n\n")


def htmlheader(title = ""):
    """Emit the CGI header and the opening HTML boilerplate using *title*."""
    preamble = [
        "Content-type:text/html\r\n\r\n",
        "<html>",
        f"<head><title>{title}</title><link rel='stylesheet' href='dark.css' /></head>",
        "<body>",
    ]
    for line in preamble:
        print(line)

def htmlfooter():
    """Close the <body> and <html> elements opened by htmlheader()."""
    print("</body>\n</html>")


# Default configuration: enable every cleanup stage except the experimental
# ones, and use the dark stylesheet.
set_options = {name: True for name in options.keys()}
set_options["split"] = False
set_options["novisible"] = False
set_options["owncss"] = "dark"


if "GATEWAY_INTERFACE" in os.environ.keys():
    # Running as CGI: all options come from the query string.
    # parse_qs maps every key to a LIST of values.
    arglist = parse.parse_qs(os.getenv("QUERY_STRING"))
    if "site" not in arglist.keys():
        # Bug fix: the original called the undefined helphtml().
        help(html=True)
        exit(0)

    if "help" in arglist.keys():
        help(html=True)
        exit(0)

    for option in options.keys():
        # Bug fix: the original indexed arglist with the short flag
        # (options[option][0]) although the guard checked the long name,
        # raising KeyError, and compared parse_qs' value LIST against a
        # plain string, which can never be equal.
        # NOTE(review): <option>=1 now enables the stage and <option>=0
        # disables it; the broken original looked inverted — confirm intent.
        if option in arglist.keys() and arglist[option][0] == "1":
            set_options[option] = True
        elif option in arglist.keys() and arglist[option][0] == "0":
            set_options[option] = False

    if "owncss" in arglist.keys():
        # Bug fix: compare the first query value, not the whole list.
        if arglist["owncss"][0] in ("dark", "light"):
            set_options["owncss"] = arglist["owncss"][0]
        else:
            set_options["owncss"] = ""

    try:
        with request.urlopen(arglist["site"][0], data=None, timeout=3) as response:
            charset = "utf-8"
            for item in response.headers.items():
                if item[0] == "Content-Type":
                    # Bug fix: the original escaped the parentheses and so
                    # looked for a literal "charset=(...)"; capture the
                    # charset token instead.
                    charsetobj = re.search(r"charset=([^\s;]+)", item[1])
                    if charsetobj is not None:
                        charset = charsetobj[1]
                    break
            origfile = response.read().decode(charset)
    # Bug fix: HTTPError is a subclass of URLError and must be caught
    # first; the original order made this branch unreachable. The body
    # also called the undefined wraphtml() and used undecoded bytes.
    except urllib.error.HTTPError as e:
        content = e.read().decode("utf-8", errors="replace")
        htmlheader(str(e.code))
        print(str(e.code) + ": " + content)
        htmlfooter()
        exit(0)
    except urllib.error.URLError as e:
        htmlheader(str(e.reason))
        print(e.reason)
        htmlfooter()
        exit(0)

else:
    # Running from the command line: parse the options with getopt.
    optlist = []
    arglist = sys.argv[1:]
    # Bug fix: "-o" takes a value, so it needs a ":" in the short list.
    getopt_shortlist = "o:vh"
    getopt_longlist = ["owncss=", "verbose", "help"]
    for option in options.keys():
        getopt_shortlist += options[option][0]
        getopt_longlist.append("%s=" % option)
    # Bug fix: options were only parsed with more than two arguments, so
    # e.g. "debloat.py --help" tried to open "--help" as the input file.
    if len(sys.argv) > 1:
        optlist, arglist = getopt.getopt(sys.argv[1:], getopt_shortlist, getopt_longlist)
    for opt, arg in optlist:
        # Bug fix: getopt returns option names WITH their leading dashes,
        # so the original comparisons against "o"/"v"/"h" never matched.
        if opt in ("-o", "--owncss"):
            set_options["owncss"] = arg
            continue
        if opt in ("-v", "--verbose"):
            optverbose = True
            continue
        if opt in ("-h", "--help"):
            help()
            exit(0)

        found = 0
        for option in options.keys():
            if opt[2:] == option or opt == "-" + options[option][0]:
                # Bug fix: getopt yields "" (not None) for valueless short
                # flags, so the original check never enabled them.
                if arg in ("", "yes", "true"):
                    set_options[option] = True
                    # NOTE(review): logging enabled options at LOG_ERR looks
                    # like leftover debugging — consider removing.
                    syslog.syslog(syslog.LOG_ERR, option)
                    found = 1
                    break
                elif arg in ("no", "false"):
                    set_options[option] = False
                    found = 1
                    break
        if found != 1:
            help()
            exit(1)

    with open(arglist[0], "r") as fp:
        origfile = fp.read()

# Run every enabled cleanup stage over the loaded document.
filecontent = origfile
for option in set_options.keys():
    if option == "owncss":
        # Bug fix: an empty/disabled owncss setting used to insert a bogus
        # <link href='.css'> tag; skip the insertion instead.
        if set_options["owncss"]:
            filecontent = add_owncss(filecontent, set_options["owncss"])
        continue
    if set_options[option]:
        count, content, filecontent = options[option][2](filecontent)


# Deliver the result: CGI responses go to stdout with an HTTP header; CLI
# runs write the outfile when given, otherwise print the bare document.
if "GATEWAY_INTERFACE" in os.environ.keys():
    httpheader()
    print(filecontent)
elif len(arglist) > 1:
    with open(arglist[1], "w") as fp:
        fp.write(filecontent)
else:
    # Bug fix: the plain CLI stdout path used to emit an HTTP
    # "Content-type" header, which is wrong outside CGI.
    print(filecontent)



