tenfourfox/testing/mozharness/scripts/release/antivirus.py
from multiprocessing.pool import ThreadPool
import os
import re
import shutil
import sys

# Make the mozharness package importable when this script is run from
# within the scripts/release directory.
sys.path.insert(1, os.path.dirname(os.path.dirname(sys.path[0])))

from mozharness.base.python import VirtualenvMixin, virtualenv_config_options
from mozharness.base.script import BaseScript


class AntivirusScan(BaseScript, VirtualenvMixin):
    config_options = [
        [["--product"], {
            "dest": "product",
            "help": "Product being released, e.g.: firefox, thunderbird",
        }],
        [["--version"], {
            "dest": "version",
            "help": "Version of release, e.g.: 39.0b5",
        }],
        [["--build-number"], {
            "dest": "build_number",
            "help": "Build number of release, e.g.: 2",
        }],
        [["--bucket-name"], {
            "dest": "bucket_name",
            "help": "S3 bucket to retrieve files from",
        }],
        [["--exclude"], {
            "dest": "excludes",
            "action": "append",
            "help": "List of filename patterns to exclude. See script source for default",
        }],
        [["-d", "--download-parallelization"], {
            "dest": "download_parallelization",
            "default": 6,
            "type": "int",
            "help": "Number of concurrent file downloads",
        }],
        [["-s", "--scan-parallelization"], {
            "dest": "scan_parallelization",
            "default": 4,
            "type": "int",
            "help": "Number of concurrent file scans",
        }],
[["--tools-repo"], {
"dest": "tools_repo",
"default": "https://hg.mozilla.org/build/tools",
}],
[["--tools-revision"], {
"dest": "tools_revision",
"help": "Revision of tools repo to use when downloading extract_and_run_command.py",
}],
] + virtualenv_config_options
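
    # For illustration, a typical invocation might look like the following.
    # The bucket name is a placeholder, not a default; the version and build
    # number are the examples from the option help strings above:
    #
    #   python antivirus.py --product firefox --version 39.0b5 \
    #       --build-number 2 --bucket-name <candidates-bucket> \
    #       --tools-revision default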

    # Filename patterns that are never downloaded or scanned. Each entry is
    # matched against the full S3 key name with re.search().
    DEFAULT_EXCLUDES = [
        r"^.*tests.*$",
        r"^.*crashreporter.*$",
        r"^.*\.zip(\.asc)?$",
        r"^.*\.log$",
        r"^.*\.txt$",
        r"^.*\.asc$",
        r"^.*/partner-repacks.*$",
        r"^.*\.checksums(\.asc)?$",
        r"^.*/logs/.*$",
        r"^.*/jsshell.*$",
        r"^.*\.json$",
        r"^.*/host.*$",
        r"^.*/mar-tools/.*$",
        r"^.*gecko-unsigned-unaligned\.apk$",
        r"^.*robocop\.apk$",
        r"^.*contrib.*$",
    ]
    CACHE_DIR = 'cache'
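
    # For illustration, a hypothetical key such as
    # "pub/firefox/candidates/39.0b5-candidates/build2/firefox-39.0b5.checksums"
    # would be skipped by the checksums pattern above.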

    def __init__(self):
        BaseScript.__init__(self,
                            config_options=self.config_options,
                            require_config_file=False,
                            config={
                                "virtualenv_modules": [
                                    "boto",
                                    "redo",
                                    "mar",
                                ],
                                "virtualenv_path": "venv",
                            },
                            all_actions=[
                                "create-virtualenv",
                                "activate-virtualenv",
                                "get-extract-script",
                                "get-files",
                                "scan-files",
                                "cleanup-cache",
                            ],
                            default_actions=[
                                "create-virtualenv",
                                "activate-virtualenv",
                                "get-extract-script",
                                "get-files",
                                "scan-files",
                                "cleanup-cache",
                            ],
                            )
        self.excludes = self.config.get('excludes', self.DEFAULT_EXCLUDES)
        self.dest_dir = self.CACHE_DIR

    def _get_candidates_prefix(self):
        return "pub/{}/candidates/{}-candidates/build{}/".format(
            self.config['product'],
            self.config["version"],
            self.config["build_number"]
        )
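
    # For illustration, with the example values from the option help strings
    # (product=firefox, version=39.0b5, build_number=2) the prefix above is
    # "pub/firefox/candidates/39.0b5-candidates/build2/".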

    def _matches_exclude(self, keyname):
        """Return True if keyname matches any of the exclude patterns."""
        for exclude in self.excludes:
            if re.search(exclude, keyname):
                return True
        return False

    def get_extract_script(self):
        """Gets a copy of extract_and_run_command.py from tools, and the supporting
        mar.py, so that we can unpack various files for clam to scan them."""
        remote_file = "{}/raw-file/{}/stage/extract_and_run_command.py".format(
            self.config["tools_repo"], self.config["tools_revision"])
        self.download_file(remote_file, file_name="extract_and_run_command.py")

    def get_files(self):
        """Pull the candidate files down from S3 for scanning, using parallel requests"""
        from boto.s3.connection import S3Connection
        from boto.exception import S3CopyError, S3ResponseError
        from redo import retry
        from httplib import HTTPException

        # suppress boto debug logging, it's too verbose with --loglevel=debug
        import logging
        logging.getLogger('boto').setLevel(logging.INFO)

        self.info("Connecting to S3")
        conn = S3Connection(anon=True)
        self.info("Getting bucket {}".format(self.config["bucket_name"]))
        bucket = conn.get_bucket(self.config["bucket_name"])

        if os.path.exists(self.dest_dir):
            self.info('Emptying {}'.format(self.dest_dir))
            shutil.rmtree(self.dest_dir)
        os.makedirs(self.dest_dir)

        def worker(item):
            source, destination = item
            self.info("Downloading {} to {}".format(source, destination))
            key = bucket.get_key(source)
            return retry(key.get_contents_to_filename,
                         args=(destination, ),
                         sleeptime=30, max_sleeptime=150,
                         retry_exceptions=(S3CopyError, S3ResponseError,
                                           IOError, HTTPException))
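
        # redo's retry() re-runs the download whenever one of the transient
        # S3/network errors listed above is raised, sleeping between attempts
        # (starting at 30s and capped at 150s here); other exceptions
        # propagate immediately.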

        def find_release_files():
            candidates_prefix = self._get_candidates_prefix()
            self.info("Getting key names from candidates")
            for key in bucket.list(prefix=candidates_prefix):
                keyname = key.name
                if self._matches_exclude(keyname):
                    self.debug("Excluding {}".format(keyname))
                else:
                    destination = os.path.join(self.dest_dir,
                                               keyname.replace(candidates_prefix, ''))
                    dest_dir = os.path.dirname(destination)
                    if not os.path.isdir(dest_dir):
                        os.makedirs(dest_dir)
                    yield (keyname, destination)
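
        # Note: ThreadPool.map() consumes the whole generator before
        # dispatching, so the (source, destination) pairs are computed up
        # front and then downloaded download_parallelization at a time.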
        pool = ThreadPool(self.config["download_parallelization"])
        pool.map(worker, find_release_files())

    def scan_files(self):
        """Scan the files we've collected. We don't do the download and scan
        concurrently, to make it easier to have a coherent log afterwards.
        Uses the venv python."""
        self.run_command([self.query_python_path(), 'extract_and_run_command.py',
                          '-j{}'.format(self.config['scan_parallelization']),
                          'clamdscan', '-m', '--no-summary', '--', self.dest_dir])
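
    # With the default scan_parallelization of 4 and the "cache" dest dir,
    # the command above amounts to roughly:
    #   <venv python> extract_and_run_command.py -j4 clamdscan -m --no-summary -- cache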

    def cleanup_cache(self):
        """If we have simultaneous releases in flight an av slave may end up doing
        another av job before being recycled, and we need to make sure the full
        disk is available."""
        shutil.rmtree(self.dest_dir)


if __name__ == "__main__":
    myScript = AntivirusScan()
    myScript.run_and_exit()