tenfourfox/testing/web-platform/tests/html/tools/update_html5lib_tests.py

import sys
import os
import hashlib
import urllib
import itertools
import re
import json
import glob
import shutil

try:
    import genshi
    from genshi.template import MarkupTemplate

    from html5lib.tests import support
except ImportError:
    print """This script requires the Genshi templating library and html5lib source

It is recommended that these are installed in a virtualenv:

virtualenv venv
source venv/bin/activate
pip install genshi
cd venv
git clone git@github.com:html5lib/html5lib-python.git html5lib
cd html5lib
git submodule init
git submodule update
pip install -e ./

Then run this script again, with the virtual environment still active.
When you are done, type "deactivate" to deactivate the virtual environment.
"""

TESTS_PATH = "html/syntax/parsing/"

def get_paths():
    script_path = os.path.split(os.path.abspath(__file__))[0]
    repo_base = get_repo_base(script_path)
    tests_path = os.path.join(repo_base, TESTS_PATH)
    return script_path, tests_path

def get_repo_base(path):
    while path:
        if os.path.exists(os.path.join(path, ".git")):
            return path
        else:
            path = os.path.split(path)[0]

def get_expected(data):
    data = "#document\n" + data
    return data

def get_hash(data, container=None):
    if container == None:
        container = ""
    return hashlib.sha1("#container%s#data%s"%(container.encode("utf8"),
                                               data.encode("utf8"))).hexdigest()

def make_tests(script_dir, out_dir, input_file_name, test_data):
    tests = []
    innerHTML_tests = []
    ids_seen = {}
    print input_file_name
    for test in test_data:
        if "script-off" in test:
            continue
        is_innerHTML = "document-fragment" in test
        data = test["data"]
        container = test["document-fragment"] if is_innerHTML else None
        assert test["document"], test
        expected = get_expected(test["document"])
        test_list = innerHTML_tests if is_innerHTML else tests
        test_id = get_hash(data, container)
        if test_id in ids_seen:
            print "WARNING: id %s seen multiple times in file %s this time for test (%s, %s) before for test %s, skipping"%(test_id, input_file_name, container, data, ids_seen[test_id])
            continue
        ids_seen[test_id] = (container, data)
        test_list.append({'string_uri_encoded_input':"\"%s\""%urllib.quote(data.encode("utf8")),
                          'input':data,
                          'expected':expected,
                          'string_escaped_expected':json.dumps(urllib.quote(expected.encode("utf8"))),
                          'id':test_id,
                          'container':container
                          })
    path_normal = None
    if tests:
        path_normal = write_test_file(script_dir, out_dir,
                                      tests, "html5lib_%s"%input_file_name,
                                      "html5lib_test.xml")
    path_innerHTML = None
    if innerHTML_tests:
        path_innerHTML = write_test_file(script_dir, out_dir,
                                         innerHTML_tests, "html5lib_innerHTML_%s"%input_file_name,
                                         "html5lib_test_fragment.xml")

    return path_normal, path_innerHTML

def write_test_file(script_dir, out_dir, tests, file_name, template_file_name):
    file_name = os.path.join(out_dir, file_name + ".html")
    short_name = os.path.split(file_name)[1]

    with open(os.path.join(script_dir, template_file_name)) as f:
        template = MarkupTemplate(f)

    stream = template.generate(file_name=short_name, tests=tests)

    with open(file_name, "w") as f:
        f.write(stream.render('html', doctype='html5',
                              encoding="utf8"))
    return file_name

def escape_js_string(in_data):
    return in_data.encode("utf8").encode("string-escape")

def serialize_filenames(test_filenames):
    return "[" + ",\n".join("\"%s\""%item for item in test_filenames) + "]"

def main():

    script_dir, out_dir = get_paths()

    test_files = []
    inner_html_files = []

    if len(sys.argv) > 2:
        test_iterator = itertools.izip(
            itertools.repeat(False),
            sorted(os.path.abspath(item) for item in
                   glob.glob(os.path.join(sys.argv[2], "*.dat"))))
    else:
        test_iterator = itertools.chain(
            itertools.izip(itertools.repeat(False),
                           sorted(support.get_data_files("tree-construction"))),
            itertools.izip(itertools.repeat(True),
                           sorted(support.get_data_files(
                        os.path.join("tree-construction", "scripted")))))

    for (scripted, test_file) in test_iterator:
        input_file_name = os.path.splitext(os.path.split(test_file)[1])[0]
        if scripted:
            input_file_name = "scripted_" + input_file_name
        test_data = support.TestData(test_file)
        test_filename, inner_html_file_name = make_tests(script_dir, out_dir,
                                                         input_file_name, test_data)
        if test_filename is not None:
            test_files.append(test_filename)
        if inner_html_file_name is not None:
            inner_html_files.append(inner_html_file_name)

if __name__ == "__main__":
    main()