Source code for macsypy.search_genes

# -*- coding: utf-8 -*-

################################################################################
# MacSyFinder - Detection of macromolecular systems in protein datasets        #
#               using systems modelling and similarity search.                 #
# Authors: Sophie Abby, Bertrand Néron                                         #
# Copyright © 2014  Institut Pasteur (Paris) and CNRS.                                   #
# See the COPYRIGHT file for details                                           #
#                                                                              #
# MacsyFinder is distributed under the terms of the GNU General Public License #
# (GPLv3). See the COPYING file for details.                                   #
################################################################################



import threading
import logging
_log = logging.getLogger('macsyfinder.' + __name__)
import signal
import sys
import shutil
import os.path
from report import GembaseHMMReport, GeneralHMMReport, OrderedHMMReport 

[docs]def search_genes(genes, cfg): """ For each gene of the list, use the corresponding profile to perform an Hmmer search, and parse the output to generate a HMMReport that is saved in a file after Hit filtering. These tasks are performed in parallel using threads. The number of workers can be limited by worker_nb directive in the config object or in the command-line with the \"-w\" option. :param genes: the genes to search in the input sequence dataset :type genes: list of :class:`macsypy.gene.Gene` objects :param cfg: the configuration object :type cfg: :class:`macsypy.config.Config` object """ worker_nb = cfg.worker_nb if not worker_nb: worker_nb = len(genes) _log.debug("worker_nb = %d" % worker_nb) sema = threading.BoundedSemaphore(value = worker_nb) all_reports = [] def stop(signum, frame): """stop the main process, its threads and subprocesses""" _log.critical('KILL all Processes') proc_grp_id = os.getpgid(0) os.killpg(proc_grp_id, signum) sys.exit(-1 * signum) signal.signal(signal.SIGTERM, stop) def search(gene, all_reports, sema): """ Search gene in the database built from the input sequence file (execute \"hmmsearch\"), and produce a HMMReport :param gene: the gene to search :type gene: a :class:`macsypy.gene.Gene` object :param all_reports: a container to append the generated HMMReport objects :type all_reports: list of `macsypy.report.HMMReport` object (derived class depending on the input dataset type) :param sema: semaphore to limit the number of parallel workers :type sema: a threading.BoundedSemaphore """ with sema: _log.info("search gene %s" % gene.name) profile = gene.profile report = profile.execute() if report: report.extract() report.save_extract() all_reports.append(report) def recover(gene, all_reports, cfg, sema): """ Recover Hmmer output from a previous run, and produce a report :param gene: the gene to search :type gene: a :class:`macsypy.gene.Gene` object :param all_reports: a container to append the generated HMMReport object :type all_reports: list :param cfg: the configuration :type cfg: :class:`macsypy.config.Config` object :param sema: semaphore to limit the number of parallel workers :type sema: a threading.BoundedSemaphore. :return: the list of all HMMReports (derived class depending on the input dataset type) :rtype: list of `macsypy.report.HMMReport` object """ with sema: hmm_old_path = os.path.join(cfg.previous_run, cfg.hmmer_dir, gene.name + cfg.res_search_suffix) _log.info("recover hmm %s" % hmm_old_path) hmm_new_path = os.path.join(cfg.working_dir, cfg.hmmer_dir, gene.name + cfg.res_search_suffix) shutil.copy(hmm_old_path, hmm_new_path) gene.profile.hmm_raw_output = hmm_new_path if cfg.db_type == 'gembase': report = GembaseHMMReport(gene, hmm_new_path, cfg ) elif cfg.db_type == 'ordered_replicon': report = OrderedHMMReport(gene, hmm_new_path, cfg ) else: report = GeneralHMMReport(gene, hmm_new_path, cfg ) if report: report.extract() report.save_extract() all_reports.append(report) #there is only one instance of gene per name but the same instance can be #in all genes several times #hmmsearch and extract should be execute only once per run #so I uniquify the list of gene genes = set(genes) _log.debug("start searching genes") previous_run = cfg.previous_run for gene in genes: if previous_run and os.path.exists(os.path.join(previous_run, cfg.hmmer_dir, gene.name + cfg.res_search_suffix)): t = threading.Thread(target = recover, args = (gene, all_reports, cfg, sema)) else: t = threading.Thread(target = search, args = (gene, all_reports, sema)) t.start() main_thread = threading.currentThread() while threading.activeCount() > 1: for t in threading.enumerate(): if t is main_thread: continue t.join(10) _log.debug("end searching genes") return all_reports