#!/usr/bin/env python
# ***** BEGIN LICENSE BLOCK *****
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
# ***** END LICENSE BLOCK *****
"""vcs_sync.py
hg<->git conversions. Needs to support both the monolithic beagle/gecko.git
type conversions, as well as many-to-many (l10n, build repos, etc.)
"""
from copy import deepcopy
import mmap
import os
import pprint
import re
import sys
import time
try:
    import simplejson as json
    assert json
except ImportError:
    import json
sys.path.insert(1, os.path.dirname(os.path.dirname(sys.path[0])))
import mozharness
external_tools_path = os.path.join(
os.path.abspath(os.path.dirname(os.path.dirname(mozharness.__file__))),
'external_tools',
)
from mozharness.base.errors import HgErrorList, GitErrorList
from mozharness.base.log import INFO, ERROR, FATAL
from mozharness.base.python import VirtualenvMixin, virtualenv_config_options
from mozharness.base.transfer import TransferMixin
from mozharness.base.vcs.vcssync import VCSSyncScript
from mozharness.mozilla.tooltool import TooltoolMixin
# HgGitScript {{{1
class HgGitScript(VirtualenvMixin, TooltoolMixin, TransferMixin, VCSSyncScript):
""" Beagle-oriented hg->git script (lots of mozilla-central hardcodes;
assumption that we're going to be importing lots of branches).
Beagle is a git repo of mozilla-central, with full cvs history,
and a number of developer-oriented repositories and branches added.
The partner-oriented gecko.git could also be incorporated into this
script with some changes.
"""
mapfile_binary_search = None
all_repos = None
successful_repos = []
config_options = [
[["--no-check-incoming", ], {
"action": "store_false",
"dest": "check_incoming",
"default": True,
"help": "Don't check for incoming changesets",
}],
[["--max-log-sample-size", ], {
"action": "store",
"dest": "email_max_log_sample_size",
"type": "int",
"default": 102400,
"help": "Specify the maximum number of characters from a log file to be "
"embedded in the email body, per embedding (not total - note we "
"embed two separate log samples into the email - so maximum "
"email body size can end up a little over 2x this amount).",
}],
]
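    # Illustrative invocation (hypothetical config path). The base mozharness
    # option parser supplies -c/--config-file; the options above extend it:
    #
    #   python scripts/vcs-sync/vcs_sync.py -c path/to/vcs_sync_config.py \
    #       --no-check-incoming
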
    def __init__(self, require_config_file=True):
        super(HgGitScript, self).__init__(
            config_options=virtualenv_config_options + self.config_options,
            all_actions=[
                'clobber',
                'list-repos',
                'create-virtualenv',
                'update-stage-mirror',
                'update-work-mirror',
                'create-git-notes',
                'publish-to-mapper',
                'push',
                'combine-mapfiles',
                'upload',
                'notify',
            ],
            # These default actions are the update loop that we run after the
            # initial steps to create the work mirror with all the branches +
            # cvs history have been run.
            default_actions=[
                'list-repos',
                'create-virtualenv',
                'update-stage-mirror',
                'update-work-mirror',
                'push',
                'combine-mapfiles',
                'upload',
                'notify',
            ],
            require_config_file=require_config_file
        )
        self.remote_targets = None

# Helper methods {{{1
    def query_abs_dirs(self):
        """ Define paths.
        """
        if self.abs_dirs:
            return self.abs_dirs
        abs_dirs = super(HgGitScript, self).query_abs_dirs()
        abs_dirs['abs_cvs_history_dir'] = os.path.join(
            abs_dirs['abs_work_dir'], 'mozilla-cvs-history')
        abs_dirs['abs_source_dir'] = os.path.join(
            abs_dirs['abs_work_dir'], 'stage_source')
        abs_dirs['abs_repo_sync_tools_dir'] = os.path.join(
            abs_dirs['abs_work_dir'], 'repo-sync-tools')
        abs_dirs['abs_git_rewrite_dir'] = os.path.join(
            abs_dirs['abs_work_dir'], 'mc-git-rewrite')
        abs_dirs['abs_target_dir'] = os.path.join(abs_dirs['abs_work_dir'],
                                                  'target')
        if 'conversion_dir' in self.config:
            abs_dirs['abs_conversion_dir'] = os.path.join(
                abs_dirs['abs_work_dir'], 'conversion',
                self.config['conversion_dir']
            )
        self.abs_dirs = abs_dirs
        return self.abs_dirs

    def init_git_repo(self, path, additional_args=None, deny_deletes=False):
        """ Create a git repo, with retries.

            We call this with additional_args=['--bare'] to save disk +
            make things cleaner.
        """
        git = self.query_exe("git", return_type="list")
        cmd = git + ['init']
        # generally for --bare
        if additional_args:
            cmd.extend(additional_args)
        cmd.append(path)
        status = self.retry(
            self.run_command,
            args=(cmd, ),
            error_level=FATAL,
            error_message="Can't set up %s!" % path
        )
        status = self.run_command(
            git + ['config', 'receive.denyNonFastForwards', 'true'],
            cwd=path
        )
        if deny_deletes:
            status = self.run_command(
                git + ['config', 'receive.denyDeletes', 'true'],
                cwd=path
            )
        return status

    def write_hggit_hgrc(self, dest):
        """ Update .hg/hgrc, if not already updated
        """
        hgrc = os.path.join(dest, '.hg', 'hgrc')
        contents = ''
        if os.path.exists(hgrc):
            contents = self.read_from_file(hgrc)
        if 'hggit=' not in contents:
            hgrc_update = """[extensions]
hggit=
[git]
intree=1
"""
            self.write_to_file(hgrc, hgrc_update, open_mode='a')

    def _process_locale(self, locale, type, config, l10n_remote_targets, name, l10n_repos):
        """ This contains the common processing that we do on both gecko_config
            and gaia_config for a given locale.
        """
        replace_dict = {'locale': locale}
        new_targets = deepcopy(config.get('targets', {}))
        for target in new_targets:
            dest = target['target_dest']
            if '%(locale)s' in dest:
                new_dest = dest % replace_dict
                target['target_dest'] = new_dest
                remote_target = l10n_remote_targets.get(new_dest)
                if remote_target is None:  # generate target if not seen before
                    possible_remote_target = l10n_remote_targets.get(dest)
                    # The target may be remote or local: local targets have no
                    # entry in l10n_remote_targets, remote targets do, so a
                    # non-None lookup tells us this target is remote.
                    if possible_remote_target is not None:
                        remote_target = deepcopy(possible_remote_target)
                        remote_repo = remote_target.get('repo')
                        if '%(locale)s' in remote_repo:
                            remote_target['repo'] = remote_repo % replace_dict
                        l10n_remote_targets[new_dest] = remote_target
        long_name = '%s_%s_%s' % (type, name, locale)
        repo_dict = {
            'repo': config['hg_url'] % replace_dict,
            'revision': 'default',
            'repo_name': long_name,
            'conversion_dir': long_name,
            'mapfile_name': '%s-mapfile' % long_name,
            'targets': new_targets,
            'vcs': 'hg',
            'branch_config': {
                'branches': {
                    'default': config['git_branch_name'],
                },
            },
            'tag_config': config.get('tag_config', {}),
            'mapper': config.get('mapper', {}),
            'generate_git_notes': config.get('generate_git_notes', {}),
        }
        l10n_repos.append(repo_dict)

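    # Illustrative sketch of the %(locale)s templating above (hypothetical
    # URL and names). With locale='de', type='gecko', name='v2_2' and
    #   hg_url = 'https://hg.example.org/releases/l10n/%(locale)s'
    # the generated repo_dict would include:
    #   'repo': 'https://hg.example.org/releases/l10n/de',
    #   'repo_name': 'gecko_v2_2_de',
    #   'mapfile_name': 'gecko_v2_2_de-mapfile',
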
    def _query_l10n_repos(self):
        """ Build the list of l10n repos dynamically, rather than maintaining
            a huge static list: read the list of locales from each project's
            SSoT (single source of truth) locales files.
        """
        l10n_repos = []
        l10n_remote_targets = deepcopy(self.config['remote_targets'])
        dirs = self.query_abs_dirs()
        gecko_dict = deepcopy(self.config['l10n_config'].get('gecko_config', {}))
        for name, gecko_config in gecko_dict.items():
            file_name = self.download_file(gecko_config['locales_file_url'],
                                           parent_dir=dirs['abs_work_dir'])
            if not os.path.exists(file_name):
                self.error("Can't download locales from %s; skipping!" % gecko_config['locales_file_url'])
                continue
            contents = self.read_from_file(file_name)
            for locale in contents.splitlines():
                self._process_locale(locale, 'gecko', gecko_config, l10n_remote_targets, name, l10n_repos)
        gaia_dict = deepcopy(self.config['l10n_config'].get('gaia_config', {}))
        for name, gaia_config in gaia_dict.items():
            contents = self.retry(
                self.load_json_from_url,
                args=(gaia_config['locales_file_url'],)
            )
            if not contents:
                self.error("Can't download locales from %s; skipping!" % gaia_config['locales_file_url'])
                continue
            for locale in dict(contents).keys():
                self._process_locale(locale, 'gaia', gaia_config, l10n_remote_targets, name, l10n_repos)
        self.info("Built l10n_repos...")
        self.info(pprint.pformat(l10n_repos, indent=4))
        self.info("Remote targets...")
        self.info(pprint.pformat(l10n_remote_targets, indent=4))
        self.remote_targets = l10n_remote_targets
        return l10n_repos

    def _query_project_repos(self):
        """ Build the list of project branch repos dynamically, rather than
            maintaining a huge static list.
        """
        project_repos = []
        for project in self.config.get("project_branches", []):
            repo_dict = {
                'repo': self.config['project_branch_repo_url'] % {'project': project},
                'revision': 'default',
                'repo_name': project,
                'targets': [{
                    'target_dest': 'github-project-branches',
                }],
                'vcs': 'hg',
                'branch_config': {
                    'branches': {
                        'default': project,
                    },
                },
                'tag_config': {},
            }
            project_repos.append(repo_dict)
        self.info("Built project_repos...")
        self.info(pprint.pformat(project_repos, indent=4))
        return project_repos

    def query_all_repos(self):
        """ Very simple method, but we need this concatenated list many times
            throughout the script.
        """
        if self.all_repos:
            return self.all_repos
        if self.config.get('conversion_type') == 'b2g-l10n':
            self.all_repos = self._query_l10n_repos()
        elif self.config.get('initial_repo'):
            self.all_repos = [self.config['initial_repo']] + list(self.config.get('conversion_repos', []))
        else:
            self.all_repos = list(self.config.get('conversion_repos', []))
        if self.config.get('conversion_type') == 'project-branches':
            self.all_repos += self._query_project_repos()
        return self.all_repos

    def query_all_non_failed_repos(self):
        """ Same as query_all_repos(), but filters out repos that failed in an
            earlier action. Use this for downstream actions that require all
            earlier actions to have succeeded for a given repo.
        """
        all_repos = self.query_all_repos()
        return [repo for repo in all_repos if repo.get('repo_name') not in self.failures]

    def _query_repo_previous_status(self, repo_name, repo_map=None):
        """ Return False if previous run was unsuccessful.
            Return None if no previous run information.
        """
        if repo_map is None:
            repo_map = self._read_repo_update_json()
        return repo_map.get('repos', {}).get(repo_name, {}).get('previous_push_successful')

    def _update_repo_previous_status(self, repo_name, successful_flag, repo_map=None, write_update=False):
        """ Set the repo_name to successful_flag (False for unsuccessful, True for successful)
        """
        if repo_map is None:
            repo_map = self._read_repo_update_json()
        repo_map.setdefault('repos', {}).setdefault(repo_name, {})['previous_push_successful'] = successful_flag
        if write_update:
            self._write_repo_update_json(repo_map)
        return repo_map

    def _update_stage_repo(self, repo_config, retry=True, clobber=False):
        """ Update a stage repo.
            See update_stage_mirror() for a description of the stage repos.
        """
        hg = self._query_hg_exe()
        dirs = self.query_abs_dirs()
        repo_name = repo_config['repo_name']
        source_dest = os.path.join(dirs['abs_source_dir'],
                                   repo_name)
        if clobber:
            self.rmtree(source_dest)
        if not os.path.exists(source_dest):
            if self.retry(
                self.run_command,
                args=(hg + ['clone', '--noupdate', repo_config['repo'],
                      source_dest], ),
                kwargs={
                    'output_timeout': 15 * 60,
                    'cwd': dirs['abs_work_dir'],
                    'error_list': HgErrorList,
                },
            ):
                if retry:
                    return self._update_stage_repo(
                        repo_config, retry=False, clobber=True)
                else:
                    # Don't leave a failed clone behind
                    self.rmtree(source_dest)
                    self._update_repo_previous_status(repo_name, successful_flag=False, write_update=True)
                    self.add_failure(
                        repo_name,
                        message="Can't clone %s!" % repo_config['repo'],
                        level=ERROR,
                    )
        elif self.config['check_incoming'] and repo_config.get("check_incoming", True):
            previous_status = self._query_repo_previous_status(repo_name)
            if previous_status is None:
                self.info("No previous status for %s; skipping incoming check!" % repo_name)
            elif previous_status is False:
                self.info("Previously unsuccessful status for %s; skipping incoming check!" % repo_name)
            else:
                # Run |hg incoming| and skip all subsequent actions if there
                # are no changes.
                # If you want to bypass this behavior (e.g. to update branches/tags
                # on a repo without requiring a new commit), set
                # repo_config["check_incoming"] = False.
                cmd = hg + ['incoming', '-n', '-l', '1']
                status = self.retry(
                    self.run_command,
                    args=(cmd, ),
                    kwargs={
                        'output_timeout': 5 * 60,
                        'cwd': source_dest,
                        'error_list': HgErrorList,
                        'success_codes': [0, 1, 256],
                    },
                )
                if status in (1, 256):
                    self.info("No changes for %s; skipping." % repo_name)
                    # Overload self.failures to tell downstream actions to noop on
                    # this repo
                    self.failures.append(repo_name)
                    return
                elif status != 0:
                    self.add_failure(
                        repo_name,
                        message="Error getting changes for %s; skipping!" % repo_config['repo_name'],
                        level=ERROR,
                    )
                    self._update_repo_previous_status(repo_name, successful_flag=False, write_update=True)
                    return
        cmd = hg + ['pull']
        if self.retry(
            self.run_command,
            args=(cmd, ),
            kwargs={
                'output_timeout': 15 * 60,
                'cwd': source_dest,
                'error_list': HgErrorList,
            },
        ):
            if retry:
                return self._update_stage_repo(
                    repo_config, retry=False, clobber=True)
            else:
                self._update_repo_previous_status(repo_name, successful_flag=False, write_update=True)
                self.add_failure(
                    repo_name,
                    message="Can't pull %s!" % repo_config['repo'],
                    level=ERROR,
                )
        # commenting out hg verify since it takes ~5min per repo; hopefully
        # exit codes will save us
        # if self.run_command(hg + ["verify"], cwd=source_dest):
        #     if retry:
        #         return self._update_stage_repo(repo_config, retry=False, clobber=True)
        #     else:
        #         self.fatal("Can't verify %s!" % source_dest)

    def _do_push_repo(self, base_command, refs_list=None, kwargs=None):
        """ Helper method for _push_repo() since it has to be able to break
            out of the target_repo list loop, and the commands loop borks that.
        """
        commands = []
        if refs_list:
            while len(refs_list) > 10:
                commands.append(base_command + refs_list[0:10])
                refs_list = refs_list[10:]
            commands.append(base_command + refs_list)
        else:
            commands = [base_command]
        if kwargs is None:
            kwargs = {}
        for command in commands:
            # Do the push, with retry!
            if self.retry(
                self.run_command,
                args=(command, ),
                kwargs=kwargs,
            ):
                return -1

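    # Illustrative chunking (hypothetical refs): with 23 refspecs,
    # _do_push_repo() runs three pushes -- refs[0:10], refs[10:20],
    # refs[20:23] -- each as base_command + chunk, retrying each push
    # independently, and returns -1 on the first chunk that exhausts
    # its retries.
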
    def _push_repo(self, repo_config):
        """ Push a repo to a path ("test_push") or remote server.

            This was meant to be a cross-vcs method, but currently only
            covers git pushes.
        """
        dirs = self.query_abs_dirs()
        conversion_dir = self.query_abs_conversion_dir(repo_config)
        if not conversion_dir:
            self.fatal("No conversion_dir for %s!" % repo_config['repo_name'])
        source_dir = os.path.join(dirs['abs_source_dir'], repo_config['repo_name'])
        git = self.query_exe('git', return_type='list')
        hg = self._query_hg_exe()
        return_status = ''
        for target_config in repo_config['targets']:
            test_push = False
            remote_config = {}
            if target_config.get("test_push"):
                test_push = True
                force_push = target_config.get("force_push")
                target_name = os.path.join(
                    dirs['abs_target_dir'], target_config['target_dest'])
                target_vcs = target_config.get("vcs")
            else:
                target_name = target_config['target_dest']
                if self.remote_targets is None:
                    self.remote_targets = self.config.get('remote_targets', {})
                remote_config = self.remote_targets.get(target_name, target_config)
                force_push = remote_config.get("force_push", target_config.get("force_push"))
                target_vcs = remote_config.get("vcs", target_config.get("vcs"))
            if target_vcs == "git":
                base_command = git + ['push']
                env = {}
                if force_push:
                    base_command.append("-f")
                if test_push:
                    target_git_repo = target_name
                else:
                    target_git_repo = remote_config['repo']
                    # Allow for using a custom git ssh key.
                    env['GIT_SSH_KEY'] = remote_config['ssh_key']
                    env['GIT_SSH'] = os.path.join(external_tools_path, 'git-ssh-wrapper.sh')
                base_command.append(target_git_repo)
                # Allow for pushing a subset of repo branches to the target.
                # If we specify that subset, we can also specify different
                # names for those branches (e.g. b2g18 -> master for a
                # standalone b2g18 repo)
                # We query hg for these because the conversion dir will have
                # branches from multiple hg repos, and the regexes may match
                # too many things.
                refs_list = []
                if repo_config.get('generate_git_notes', False):
                    refs_list.append('+refs/notes/commits:refs/notes/commits')
                branch_map = self.query_branches(
                    target_config.get('branch_config', repo_config.get('branch_config', {})),
                    source_dir,
                )
                # If the target_config has a branch_config, the key is the
                # local git branch and the value is the target git branch.
                if target_config.get("branch_config"):
                    for (branch, target_branch) in branch_map.items():
                        refs_list += ['+refs/heads/%s:refs/heads/%s' % (branch, target_branch)]
                # Otherwise the key is the hg branch and the value is the git
                # branch; use the git branch for both local and target git
                # branch names.
                else:
                    for (hg_branch, git_branch) in branch_map.items():
                        refs_list += ['+refs/heads/%s:refs/heads/%s' % (git_branch, git_branch)]
                # Allow for pushing a subset of tags to the target, via name or
                # regex. Again, query hg for this list because the conversion
                # dir will contain tags from multiple hg repos, and the regexes
                # may match too many things.
                tag_config = target_config.get('tag_config', repo_config.get('tag_config', {}))
                if tag_config.get('tags'):
                    for (tag, target_tag) in tag_config['tags'].items():
                        refs_list += ['+refs/tags/%s:refs/tags/%s' % (tag, target_tag)]
                if tag_config.get('tag_regexes'):
                    regex_list = []
                    for regex in tag_config['tag_regexes']:
                        regex_list.append(re.compile(regex))
                    tag_list = self.get_output_from_command(
                        hg + ['tags'],
                        cwd=source_dir,
                    )
                    if tag_list is not None:
                        for tag_line in tag_list.splitlines():
                            if not tag_line:
                                continue
                            tag_parts = tag_line.split()
                            if not tag_parts:
                                self.error("Bogus tag_line? %s" % str(tag_line))
                                continue
                            tag_name = tag_parts[0]
                            for regex in regex_list:
                                if tag_name != 'tip' and regex.search(tag_name) is not None:
                                    refs_list += ['+refs/tags/%s:refs/tags/%s' % (tag_name, tag_name)]
                                    continue
                error_msg = "%s: Can't push %s to %s!\n" % (repo_config['repo_name'], conversion_dir, target_git_repo)
                if self._do_push_repo(
                    base_command,
                    refs_list=refs_list,
                    kwargs={
                        'output_timeout': target_config.get("output_timeout", 30 * 60),
                        'cwd': os.path.join(conversion_dir, '.git'),
                        'error_list': GitErrorList,
                        'partial_env': env,
                    }
                ):
                    if target_config.get("test_push"):
                        error_msg += "This was a test push that failed; not proceeding any further with %s!\n" % repo_config['repo_name']
                    self.error(error_msg)
                    return_status += error_msg
                    if target_config.get("test_push"):
                        break
            else:
                # TODO write hg
                error_msg = "%s: Don't know how to deal with vcs %s!\n" % (
                    target_config['target_dest'], target_vcs)
                self.error(error_msg)
                return_status += error_msg
        return return_status

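    # Illustrative final push command (hypothetical repo URL, branch, and tag
    # names), as assembled above and chunked by _do_push_repo():
    #   git push -f git@git.example.org:releases/gecko.git \
    #       '+refs/heads/b2g18:refs/heads/master' \
    #       '+refs/tags/SOME_RELEASE_TAG:refs/tags/SOME_RELEASE_TAG'
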
    def _query_mapped_revision(self, revision=None, mapfile=None):
        """ Use the virtualenv mapper module to search a mapfile for a
            revision.
        """
        if not callable(self.mapfile_binary_search):
            site_packages_path = self.query_python_site_packages_path()
            sys.path.append(os.path.join(site_packages_path, 'mapper'))
            try:
                from bsearch import mapfile_binary_search
                global log
                log = self.log_obj
                self.mapfile_binary_search = mapfile_binary_search
            except ImportError as e:
                self.fatal("Can't import mapfile_binary_search! %s\nDid you create-virtualenv?" % str(e))
        # I wish mapper did this for me, but ...
        fd = open(mapfile, 'rb')
        m = mmap.mmap(fd.fileno(), 0, mmap.MAP_PRIVATE, mmap.PROT_READ)
        return self.mapfile_binary_search(m, revision)

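    # A git-mapfile is plain text with one "<git sha> <hg sha>" mapping per
    # line, e.g. (made-up shas):
    #   a94a8fe5ccb19ba61c4c0873d391e987982fbbd3 2fd4e1c67a2d28fced849ee1bb76e7391b93eb12
    # mapfile_binary_search() mmaps the file and bisects it for the requested
    # revision, which is why the mapfiles are kept sorted.
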
    def _post_fatal(self, message=None, exit_code=None):
        """ After we call fatal(), run this method before exiting.
        """
        if 'notify' in self.actions:
            self.notify(message=message, fatal=True)
        self.copy_logs_to_upload_dir()

    def _read_repo_update_json(self):
        """ repo_update.json is a file we create with information about each
            repo we're converting: git/hg branch names, git/hg revisions,
            pull datetime/timestamp, and push datetime/timestamp.

            Since we want to be able to incrementally update portions of this
            file as we pull/push each branch, we need to be able to read the
            json into memory, so we can update the dict and re-write the json
            to disk.
        """
        repo_map = {}
        dirs = self.query_abs_dirs()
        path = os.path.join(dirs['abs_upload_dir'], 'repo_update.json')
        if os.path.exists(path):
            fh = open(path, 'r')
            repo_map = json.load(fh)
            fh.close()
        return repo_map

    def query_abs_conversion_dir(self, repo_config):
        dirs = self.query_abs_dirs()
        if repo_config.get('conversion_dir'):
            dest = os.path.join(dirs['abs_work_dir'], 'conversion',
                                repo_config['conversion_dir'])
        else:
            dest = dirs.get('abs_conversion_dir')
        return dest

    def _write_repo_update_json(self, repo_map):
        """ The write portion of _read_repo_update_json().
        """
        dirs = self.query_abs_dirs()
        contents = json.dumps(repo_map, sort_keys=True, indent=4)
        self.write_to_file(
            os.path.join(dirs['abs_upload_dir'], 'repo_update.json'),
            contents,
            create_parent_dir=True
        )

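    # Illustrative repo_update.json shape (made-up values), based on the keys
    # read and written throughout this script; the top level also carries
    # last_push_* and last_successful_push_* entries:
    #   {
    #       "last_pull_timestamp": 1400000000,
    #       "last_pull_datetime": "2014-05-13 16:53 UTC",
    #       "repos": {
    #           "mozilla-central": {
    #               "previous_push_successful": true,
    #               "push_timestamp": 1400000100,
    #               "push_datetime": "2014-05-13 16:55 UTC",
    #               "branches": {
    #                   "default": {
    #                       "hg_branch": "default",
    #                       "hg_revision": "abcdef012345",
    #                       "git_branch": "master",
    #                       "git_revision": "123456abcdef0123456789abcdef0123456789ab",
    #                       "pull_timestamp": 1400000000,
    #                       "pull_datetime": "2014-05-13 16:53 UTC"
    #                   }
    #               }
    #           }
    #       }
    #   }
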
    def _query_hg_exe(self):
        """ Returns the hg executable command as a list.
        """
        # If "hg" is set in the "exes" section of the config, use that.
        # If not, get the path from self.query_virtualenv_path()
        # (respects --work-dir, --venv-path, and --virtualenv-path).
        exe_command = self.query_exe(
            'hg', return_type="list",
            default=[os.path.join(self.query_virtualenv_path(), "bin", "hg")])
        # Additional command line options can be specified in "hg_options"
        # of self.config.
        hg_options = self.config.get("hg_options", ())
        exe_command.extend(hg_options)
        return exe_command

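    # Illustrative config fragments consumed above (hypothetical values):
    #   "exes": {"hg": ["/usr/local/bin/hg"]},
    #   "hg_options": ("--config", "web.cacerts=/etc/pki/tls/certs/ca-bundle.crt"),
    # which would yield:
    #   ['/usr/local/bin/hg', '--config', 'web.cacerts=/etc/pki/tls/certs/ca-bundle.crt']
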
    def query_branches(self, branch_config, repo_path, vcs='hg'):
        """ Given a branch_config of branches and branch_regexes, return
            a dict of existing branch names to target branch names.
        """
        branch_map = {}
        if "branches" in branch_config:
            branch_map = deepcopy(branch_config['branches'])
        if "branch_regexes" in branch_config:
            regex_list = list(branch_config['branch_regexes'])
            full_branch_list = []
            if vcs == 'hg':
                hg = self._query_hg_exe()
                # This assumes we always want closed branches as well.
                # If not we may need more options.
                output = self.get_output_from_command(
                    hg + ['branches', '-a'],
                    cwd=repo_path
                )
                if output:
                    for line in output.splitlines():
                        full_branch_list.append(line.split()[0])
            elif vcs == 'git':
                git = self.query_exe("git", return_type="list")
                output = self.get_output_from_command(
                    git + ['branch', '-l'],
                    cwd=repo_path
                )
                if output:
                    for line in output.splitlines():
                        full_branch_list.append(line.replace('*', '').split()[0])
            for regex in regex_list:
                for branch in full_branch_list:
                    m = re.search(regex, branch)
                    if m:
                        # Don't overwrite branch_map[branch] if it exists
                        branch_map.setdefault(branch, branch)
        return branch_map

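    # Illustrative branch_config (hypothetical names): explicit mappings plus
    # regexes that map any other matching branch to itself.
    #   {
    #       'branches': {'default': 'master'},
    #       'branch_regexes': ['^GECKO[0-9_]*RELBRANCH$'],
    #   }
    # For a repo with a matching relbranch, query_branches() would return e.g.
    #   {'default': 'master',
    #    'GECKO20b5_2010100416_RELBRANCH': 'GECKO20b5_2010100416_RELBRANCH'}
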
    def _combine_mapfiles(self, mapfiles, combined_mapfile, cwd=None):
        """ Adapted from repo-sync-tools/combine_mapfiles

            Consolidate multiple conversion processes' mapfiles into a
            single mapfile.
        """
        self.info("Determining whether we need to combine mapfiles...")
        if cwd is None:
            cwd = self.query_abs_dirs()['abs_upload_dir']
        existing_mapfiles = []
        for f in mapfiles:
            f_path = os.path.join(cwd, f)
            if os.path.exists(f_path):
                existing_mapfiles.append(f)
            else:
                self.warning("%s doesn't exist!" % f_path)
        combined_mapfile_path = os.path.join(cwd, combined_mapfile)
        if os.path.exists(combined_mapfile_path):
            combined_timestamp = os.path.getmtime(combined_mapfile_path)
            for f in existing_mapfiles:
                f_path = os.path.join(cwd, f)
                if os.path.getmtime(f_path) > combined_timestamp:
                    # Yes, we want to combine mapfiles
                    break
            else:
                self.info("No new mapfiles to combine.")
                return
            self.move(combined_mapfile_path, "%s.old" % combined_mapfile_path)
        output = self.get_output_from_command(
            ['sort', '--unique', '-t', ' ',
             '--key=2'] + existing_mapfiles,
            silent=True, halt_on_failure=True,
            cwd=cwd,
        )
        self.write_to_file(combined_mapfile_path, output, verbose=False,
                           error_level=FATAL)
        self.run_command(['ln', '-sf', combined_mapfile,
                          '%s-latest' % combined_mapfile],
                         cwd=cwd)

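    # Illustrative combine step (made-up shas): given two mapfiles, each with
    # "<git sha> <hg sha>" lines, the |sort --unique -t ' ' --key=2| above
    # merges them sorted and deduplicated on the hg sha (field 2), so
    # overlapping conversions collapse to a single line per hg revision.
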
# Actions {{{1
    def list_repos(self):
        repos = self.query_all_repos()
        self.info(pprint.pformat(repos))

    def create_test_targets(self):
        """ This action creates local directories to do test pushes to.
        """
        dirs = self.query_abs_dirs()
        for repo_config in self.query_all_non_failed_repos():
            for target_config in repo_config['targets']:
                if not target_config.get('test_push'):
                    continue
                target_dest = os.path.join(dirs['abs_target_dir'], target_config['target_dest'])
                if not os.path.exists(target_dest):
                    self.info("Creating local target repo %s." % target_dest)
                    if target_config.get("vcs", "git") == "git":
                        self.init_git_repo(target_dest, additional_args=['--bare', '--shared=all'],
                                           deny_deletes=True)
                    else:
                        self.fatal("Don't know how to deal with vcs %s!" % target_config['vcs'])
                else:
                    self.debug("%s exists; skipping." % target_dest)

    def update_stage_mirror(self):
        """ The stage mirror is a buffer of clean clones of the source
            repositories.

            The logic behind this is that we get occasional corruption from
            |hg pull|. It's much less time-consuming to detect this in
            a clean clone, and reclone, than to detect this in a working
            conversion directory, and try to repair or reclone+reconvert.

            We pull the stage mirror into the work mirror, where the
            conversion is done.
        """
        for repo_config in self.query_all_non_failed_repos():
            self._update_stage_repo(repo_config)

    def pull_out_new_sha_lookups(self, old_file, new_file):
        """ Return an iterator over the lines in new_file that do not exist
            in old_file. If old_file can't be read, all lines in new_file are
            returned. Lines that exist in old_file but not in new_file are
            ignored. Results are sorted by the second field (the text after
            the first space in the line).

            This is somewhat equivalent to:
            ( [ ! -f "${old_file}" ] && cat "${new_file}" || diff "${old_file}" "${new_file}" | sed -n 's/> //p' ) | sort -k2
        """
        with self.opened(old_file) as (old, err):
            if err:
                self.info('Map file %s not found - probably first time this has run.' % old_file)
                old_set = frozenset()
            else:
                old_set = frozenset(old)
        with self.opened(new_file, 'rt') as (new, err):
            if err:
                self.error('Could not read contents of map file %s:\n%s' % (new_file, err.message))
                new_set = frozenset()
            else:
                new_set = frozenset(new)
        for line in sorted(new_set.difference(old_set), key=lambda line: line.partition(' ')[2]):
            yield line

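    # Illustrative behavior (made-up shas): if old_file contains
    #   "1111 aaaa\n"
    # and new_file contains
    #   "1111 aaaa\n2222 cccc\n3333 bbbb\n"
    # this yields "3333 bbbb\n" then "2222 cccc\n" (sorted on field 2).
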
    def update_work_mirror(self):
        """ Pull the latest changes into the work mirror, update the repo_map
            json, and run |hg gexport| to convert those latest changes into
            the git conversion repo.
        """
        hg = self._query_hg_exe()
        git = self.query_exe("git", return_type="list")
        dirs = self.query_abs_dirs()
        repo_map = self._read_repo_update_json()
        timestamp = int(time.time())
        datetime = time.strftime('%Y-%m-%d %H:%M %Z')
        repo_map['last_pull_timestamp'] = timestamp
        repo_map['last_pull_datetime'] = datetime
        for repo_config in self.query_all_non_failed_repos():
            repo_name = repo_config['repo_name']
            source = os.path.join(dirs['abs_source_dir'], repo_name)
            dest = self.query_abs_conversion_dir(repo_config)
            if not dest:
                self.fatal("No conversion_dir for %s!" % repo_name)
            if not os.path.exists(dest):
                self.mkdir_p(os.path.dirname(dest))
                self.run_command(hg + ['clone', '--noupdate', source, dest],
                                 error_list=HgErrorList,
                                 halt_on_failure=False)
                if os.path.exists(dest):
                    self.write_hggit_hgrc(dest)
                    self.init_git_repo('%s/.git' % dest, additional_args=['--bare'])
                    self.run_command(
                        git + ['--git-dir', '%s/.git' % dest, 'config', 'gc.auto', '0'],
                    )
                else:
                    self.add_failure(
                        repo_name,
                        message="Failed to clone %s!" % source,
                        level=ERROR,
                    )
                    continue
            # Build branch map.
            branch_map = self.query_branches(
                repo_config.get('branch_config', {}),
                source,
            )
            for (branch, target_branch) in branch_map.items():
                output = self.get_output_from_command(
                    hg + ['id', '-r', branch],
                    cwd=source
                )
                if output:
                    rev = output.split(' ')[0]
                else:
                    self.add_failure(
                        repo_name,
                        message="Branch %s doesn't exist in %s (%s cloned into staging directory %s)!" % (branch, repo_name, repo_config.get('repo'), source),
                        level=ERROR,
                    )
                    continue
                timestamp = int(time.time())
                datetime = time.strftime('%Y-%m-%d %H:%M %Z')
                if self.run_command(hg + ['pull', '-r', rev, source], cwd=dest,
                                    error_list=HgErrorList):
                    # We shouldn't have an issue pulling!
                    self.add_failure(
                        repo_name,
                        message="Unable to pull %s from stage_source; clobbering and skipping!" % repo_name,
                        level=ERROR,
                    )
                    self._update_repo_previous_status(repo_name, successful_flag=False, write_update=True)
                    # don't leave a dirty checkout behind, and skip remaining branches
                    self.rmtree(source)
                    break
                self.run_command(
                    hg + ['bookmark', '-f', '-r', rev, target_branch],
                    cwd=dest, error_list=HgErrorList,
                )
                # This might get a little large.
                repo_map.setdefault('repos', {}).setdefault(repo_name, {}).setdefault('branches', {})[branch] = {
                    'hg_branch': branch,
                    'hg_revision': rev,
                    'git_branch': target_branch,
                    'pull_timestamp': timestamp,
                    'pull_datetime': datetime,
                }
            if self.query_failure(repo_name):
                # We hit an error in the for loop above
                continue
            self.retry(
                self.run_command,
                args=(hg + ['-v', 'gexport'], ),
                kwargs={
                    'output_timeout': 15 * 60,
                    'cwd': dest,
                    'error_list': HgErrorList,
                },
                error_level=FATAL,
            )
            generated_mapfile = os.path.join(dest, '.hg', 'git-mapfile')
            self.copy_to_upload_dir(
                generated_mapfile,
                dest=repo_config.get('mapfile_name', self.config.get('mapfile_name', "gecko-mapfile")),
                log_level=INFO
            )
            for (branch, target_branch) in branch_map.items():
                # Look up each branch's own hg revision (recorded above),
                # rather than whichever rev the previous loop ended on.
                rev = repo_map['repos'][repo_name]['branches'][branch]['hg_revision']
                git_revision = self._query_mapped_revision(
                    revision=rev, mapfile=generated_mapfile)
                repo_map['repos'][repo_name]['branches'][branch]['git_revision'] = git_revision
        self._write_repo_update_json(repo_map)

    def create_git_notes(self):
        git = self.query_exe("git", return_type="list")
        for repo_config in self.query_all_non_failed_repos():
            repo = repo_config['repo']
            if repo_config.get('generate_git_notes', False):
                dest = self.query_abs_conversion_dir(repo_config)
                # 'git-mapfile' is created by the hggit plugin, containing all the mappings
                complete_mapfile = os.path.join(dest, '.hg', 'git-mapfile')
                # 'added-to-git-notes' is the set of mappings known to be recorded in the git notes
                # of the project (typically 'git-mapfile' from the previous run)
                added_to_git_notes = os.path.join(dest, '.hg', 'added-to-git-notes')
                # 'delta-git-notes' records the new mappings handled on this iteration
                # (the diff between the two files described above)
                delta_git_notes = os.path.join(dest, '.hg', 'delta-git-notes')
                git_dir = os.path.join(dest, '.git')
                self.rmtree(delta_git_notes)
                git_notes_adding_successful = True
                with self.opened(delta_git_notes, open_mode='w') as (delta_out, err):
                    if err:
                        git_notes_adding_successful = False
                        self.warning("Could not write list of unprocessed git note mappings to file %s - not critical" % delta_git_notes)
                    else:
                        for sha_lookup in self.pull_out_new_sha_lookups(added_to_git_notes, complete_mapfile):
                            (git_sha, hg_sha) = sha_lookup.split()
                            # only add a git note if not already there - note
                            # devs may have added their own notes, so don't
                            # replace any existing notes, just add to them
                            output = self.get_output_from_command(
                                git + ['notes', 'show', git_sha],
                                cwd=git_dir,
                                ignore_errors=True
                            )
                            git_note_text = 'Upstream source: %s/rev/%s' % (repo, hg_sha)
                            git_notes_add_return_code = 1
                            if not output or output.find(git_note_text) < 0:
                                git_notes_add_return_code = self.run_command(
                                    git + ['notes', 'append', '-m', git_note_text, git_sha],
                                    cwd=git_dir
                                )
                            # if the note was successfully added, or it was already
                            # there, mark it as handled by putting it in the delta file
                            if git_notes_add_return_code == 0 or (output and output.find(git_note_text) >= 0):
                                print >>delta_out, sha_lookup,
                            else:
                                self.error("Was not able to append required git note for git commit %s ('%s')" % (git_sha, git_note_text))
                                git_notes_adding_successful = False
                if git_notes_adding_successful:
                    self.copyfile(complete_mapfile, added_to_git_notes)
            else:
                self.info("Not creating git notes for repo %s (generate_git_notes not set to True)" % repo)

    def publish_to_mapper(self):
        """ Push any new git<->hg mappings found in the generated mapfile to
            the mapper service.
        """
        for repo_config in self.query_all_non_failed_repos():
            dest = self.query_abs_conversion_dir(repo_config)
            # 'git-mapfile' is created by the hggit plugin, containing all the mappings
            complete_mapfile = os.path.join(dest, '.hg', 'git-mapfile')
            # 'published-to-mapper' is all the mappings that are known to be published
            # to mapper, for this project (typically the 'git-mapfile' from the previous
            # run)
            published_to_mapper = os.path.join(dest, '.hg', 'published-to-mapper')
            # 'delta-for-mapper' is the set of mappings that need to be published to
            # mapper on this iteration, i.e. the diff between the two files described
            # above
            delta_for_mapper = os.path.join(dest, '.hg', 'delta-for-mapper')
            self.rmtree(delta_for_mapper)
            # we only replace published_to_mapper if we successfully
            # pushed to mapper
            mapper_config = repo_config.get('mapper', {})
            if mapper_config:
                site_packages_path = self.query_python_site_packages_path()
                if site_packages_path not in sys.path:
                    sys.path.append(site_packages_path)
                try:
                    import requests
                except ImportError as e:
                    self.error("Can't import requests: %s\nDid you create-virtualenv?" % str(e))
                mapper_url = mapper_config['url']
                mapper_project = mapper_config['project']
                insert_url = "%s/%s/insert/ignoredups" % (mapper_url, mapper_project)
                headers = {
                    'Content-Type': 'text/plain',
                    'Authentication': 'Bearer %s' % os.environ["RELENGAPI_INSERT_HGGIT_MAPPINGS_AUTH_TOKEN"]
                }
                all_new_mappings = []
                all_new_mappings.extend(self.pull_out_new_sha_lookups(published_to_mapper, complete_mapfile))
                self.write_to_file(delta_for_mapper, "".join(all_new_mappings))
                # due to timeouts on the load balancer, we only push 200 lines at a
                # time; this means that we should get an http response back within 30
                # seconds, including the time it takes to insert the mappings in the
                # database
                publish_successful = True
                for i in range(0, len(all_new_mappings), 200):
                    r = requests.post(insert_url, data="".join(all_new_mappings[i:i + 200]), headers=headers)
                    if r.status_code != 200:
                        self.error("Could not publish mapfile ('%s') line range [%s, %s] to mapper (%s) - received http %s code" % (delta_for_mapper, i, i + 200, insert_url, r.status_code))
                        publish_successful = False
                        # we won't break out, since we may be able to publish other
                        # mappings, and duplicates are allowed, so we will push the
                        # whole lot again next time anyway
                    else:
                        self.info("Published mapfile ('%s') line range [%s, %s] to mapper (%s)" % (delta_for_mapper, i, i + 200, insert_url))
                if publish_successful:
                    # if we get this far, we know we could successfully post to mapper,
                    # so now we can copy the mapfile over the "previously published"
                    # version, so that we don't push these mappings to mapper again
                    self.copyfile(complete_mapfile, published_to_mapper)
            else:
                # no mapper service configured for this repo; just mark everything
                # as published
                self.copyfile(complete_mapfile, published_to_mapper)

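    # Illustrative mapper request (hypothetical URL, project, and token): each
    # batch is an HTTP POST of up to 200 plain-text "<git sha> <hg sha>" lines:
    #   POST https://api.example.org/mapper/gecko-dev/insert/ignoredups
    #   Content-Type: text/plain
    #   Authentication: Bearer $RELENGAPI_INSERT_HGGIT_MAPPINGS_AUTH_TOKEN
    # The 'ignoredups' endpoint allows already-inserted mappings to be
    # re-posted safely on the next run.
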
    def combine_mapfiles(self):
        """ This method is for any job (l10n, project-branches) that needs to
            combine mapfiles.
        """
        if not self.config.get("combined_mapfile"):
            self.info("No combined_mapfile set in config; skipping!")
            return
        dirs = self.query_abs_dirs()
        mapfiles = []
        if self.config.get('conversion_type') == 'b2g-l10n':
            for repo_config in self.query_all_non_failed_repos():
                if repo_config.get("mapfile_name"):
                    mapfiles.append(repo_config['mapfile_name'])
        else:
            mapfiles.append(self.config.get('mapfile_name', 'gecko-mapfile'))
        if self.config.get('external_mapfile_urls'):
            for url in self.config['external_mapfile_urls']:
                file_name = self.download_file(
                    url,
                    parent_dir=dirs['abs_upload_dir'],
                    error_level=FATAL,
                )
                mapfiles.append(file_name)
        if not mapfiles:
            self.info("No mapfiles to combine; skipping!")
            return
        self._combine_mapfiles(mapfiles, self.config['combined_mapfile'])

    def push(self):
        """ Push to all targets. test_targets are local directory test repos;
            the rest are remote. Updates the repo_map json.
        """
        self.create_test_targets()
        repo_map = self._read_repo_update_json()
        failure_msg = ""
        timestamp = int(time.time())
        datetime = time.strftime('%Y-%m-%d %H:%M %Z')
        repo_map['last_push_timestamp'] = timestamp
        repo_map['last_push_datetime'] = datetime
        for repo_config in self.query_all_non_failed_repos():
            timestamp = int(time.time())
            datetime = time.strftime('%Y-%m-%d %H:%M %Z')
            status = self._push_repo(repo_config)
            repo_name = repo_config['repo_name']
            if not status:  # good
                if repo_name not in self.successful_repos:
                    self.successful_repos.append(repo_name)
                repo_map.setdefault('repos', {}).setdefault(repo_name, {})['push_timestamp'] = timestamp
                repo_map['repos'][repo_name]['push_datetime'] = datetime
                previous_status = self._query_repo_previous_status(repo_name, repo_map=repo_map)
                if previous_status is None:
                    self.add_summary("Possibly the first successful push of %s." % repo_name)
                elif previous_status is False:
                    self.add_summary("Previously unsuccessful push of %s is now successful!" % repo_name)
                self._update_repo_previous_status(repo_name, successful_flag=True, repo_map=repo_map, write_update=True)
            else:
                self.add_failure(
                    repo_name,
                    message="Unable to push %s." % repo_name,
                    level=ERROR,
                )
                failure_msg += status + "\n"
                self._update_repo_previous_status(repo_name, successful_flag=False, repo_map=repo_map, write_update=True)
        if not failure_msg:
            repo_map['last_successful_push_timestamp'] = repo_map['last_push_timestamp']
            repo_map['last_successful_push_datetime'] = repo_map['last_push_datetime']
        self._write_repo_update_json(repo_map)
        if failure_msg:
            self.fatal("Unable to push these repos:\n%s" % failure_msg)

    def preflight_upload(self):
        if not self.config.get("copy_logs_post_run", True):
            self.copy_logs_to_upload_dir()

    def upload(self):
        """ Upload the upload_dir according to the upload_config.
        """
        failure_msg = ''
        dirs = self.query_abs_dirs()
        for upload_config in self.config.get('upload_config', []):
            if self.retry(
                self.rsync_upload_directory,
                args=(
                    dirs['abs_upload_dir'],
                ),
                kwargs=upload_config,
            ):
                failure_msg += '%s:%s\n' % (upload_config['remote_host'],
                                            upload_config['remote_path'])
        if failure_msg:
            self.fatal("Unable to upload to this location:\n%s" % failure_msg)

# __main__ {{{1
if __name__ == '__main__':
    conversion = HgGitScript()
    conversion.run()