#!/usr/bin/env python

# ----------------------------------------------------------------------------
# Copyright (c) 2015, The Deblur Development Team.
#
# Distributed under the terms of the BSD 3-clause License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import click
from os import makedirs, mkdir, remove
from os.path import join, isfile, exists, isdir
from shutil import rmtree
import logging
import sys
from glob import glob

from deblur.deblurring import (deblur, get_default_error_profile)
from deblur.parallel_deblur import parallel_deblur
from deblur.workflow import (launch_workflow, trim_seqs, dereplicate_seqs,
                             remove_artifacts_seqs,
                             multiple_sequence_alignment,
                             remove_chimeras_denovo_from_seqs,
                             split_sequence_file_on_sample_ids_to_files,
                             build_index_sortmerna,
                             get_files_for_table,
                             create_otu_table,
                             start_log, sequence_generator,
                             remove_artifacts_from_biom_table)

from deblur.support_files import pos_db, neg_db
from deblur import __version__


def error_dist_from_str(ctx, param, value):
    """Validate the error profile is either None or a list of floats
    and return it as a list of floats
    """
    if value is None:
        return value
    if not isinstance(value, str):
        return value
    try:
        # if string with [], remove them
        if value[0] == '[' and value[-1] == ']':
            value = value[1:-1]
        error_dist = list(map(float, value.split(',')))
        return error_dist
    except ValueError:
        raise click.BadParameter('Error distribution must be a comma '
                                 'separated list of maximal error '
                                 'probability per hamming distance')


@click.group()
@click.version_option(version=__version__)
def deblur_cmds():
    # DEBLUR SEQUENCES COMMAND
    pass


@deblur_cmds.command()
@click.argument('seqs_fp', required=True,
                type=click.Path(resolve_path=True, readable=True, exists=True,
                                file_okay=True))
@click.option('--mean-error', '-m', required=False, type=float, default=0.005,
              show_default=True,
              help="The mean per nucleotide error, used for original "
              "sequence estimate. If "
              "not passed typical illumina error rate is used")
@click.option('--error-dist', '-d', required=False, type=str,
              default=get_default_error_profile(),
              show_default=True,
              callback=error_dist_from_str,
              help="A comma separated list of error probabilities for each "
                   "hamming distance. The length of the list determines the "
                   "number of hamming distances taken into account.")
@click.option('--indel-prob', '-i', required=False, type=float, default=0.01,
              help='Insertion/deletion (indel) probability '
                   '(same for N indels)')
@click.option('--indel-max', required=False, type=int, default=3,
              help="Maximal indel number")
@click.option('--log-level', required=False,
              type=click.IntRange(1, 5, clamp=True), default=2,
              show_default=True, help="Level of messages for log file"
              "(range 1-debug to 5-critical")
@click.option('--log-file', required=False,
              type=click.Path(resolve_path=True,
                              readable=True, exists=False,
                              dir_okay=True),
              default='deblur.log',
              show_default=True, help="log file name")
def deblur_seqs(seqs_fp, mean_error, error_dist, indel_prob,
                indel_max, log_level, log_file):
    """Clean read errors from Illumina reads"""
    start_log(level=log_level * 10, filename=log_file)
    logger = logging.getLogger(__name__)
    logger.info('deblur deblur_seqs started on file %s' % seqs_fp)

    # set the error distribution
    if error_dist is None:
        logger.info('using default error profile')
        # set to the default illumina read error profile
        error_dist = get_default_error_profile()

    logger.info('error_dist is : %s' % error_dist)

    seqs = deblur(sequence_generator(seqs_fp), mean_error, error_dist,
                  indel_prob, indel_max)

    output_path = "%s.clean" % seqs_fp
    with open(output_path, 'w') as f:
        for s in seqs:
            f.write(s.to_fasta())


# TRIM LENGTH COMMAND
@deblur_cmds.command()
@click.argument('seqs_fp', required=True,
                type=click.Path(resolve_path=True, readable=True, exists=True,
                                file_okay=True))
@click.argument('output_fp', required=True,
                type=click.Path(resolve_path=True, readable=True, exists=False,
                                file_okay=True))
@click.option('--trim-length', '-t', required=True, type=int,
              help="Sequence trim length")
@click.option('--log-level', required=False,
              type=click.IntRange(1, 5, clamp=True), default=2,
              show_default=True, help="Level of messages for log file"
              "(range 1-debug to 5-critical")
@click.option('--log-file', required=False,
              type=click.Path(resolve_path=True, readable=True,
                              exists=False, dir_okay=True),
              default='deblur.log',
              show_default=True, help="log file name")
def trim(seqs_fp, output_fp, trim_length, log_level, log_file):
    """Trim FASTA sequences"""
    start_log(level=log_level * 10, filename=log_file)
    logger = logging.getLogger(__name__)
    logger.info('deblur deblur_seqs started on file %s' % seqs_fp)
    with open(output_fp, 'w') as out_f:
        for label, seq in trim_seqs(sequence_generator(seqs_fp), trim_length):
            out_f.write(">%s\n%s\n" % (label, seq))


# SEQUENCE DEREPLICATION/SINGLETON REMOVAL COMMAND
@deblur_cmds.command()
@click.argument('seqs_fp', required=True,
                type=click.Path(resolve_path=True, readable=True, exists=True,
                                file_okay=True))
@click.argument('output_fp', required=True,
                type=click.Path(resolve_path=True, readable=True, exists=False,
                                file_okay=True))
@click.option('--min-size', required=False, type=int, default=2,
              show_default=True, help="Discard sequences with an abundance "
              "value smaller than min-size")
@click.option('--log-level', required=False,
              type=click.IntRange(1, 5, clamp=True), default=2,
              show_default=True, help="Level of messages for log file"
              "(range 1-debug to 5-critical")
@click.option('--log-file', required=False,
              type=click.Path(resolve_path=True, readable=True,
                              exists=False, dir_okay=True),
              default='deblur.log',
              show_default=True, help="log file name")
def dereplicate(seqs_fp, output_fp, min_size, log_level, log_file):
    """
    Dereplicate FASTA sequences.

    Dereplicate input FASTA sequences and remove clusters
    with fewer than minimum number of occurrences (set by --min-size).
    """
    start_log(level=log_level * 10, filename=log_file)
    dereplicate_seqs(seqs_fp=seqs_fp,
                     output_fp=output_fp,
                     min_size=min_size)


# ARTIFACT REMOVAL COMMAND
@deblur_cmds.command()
@click.argument('seqs_fp', required=True,
                type=click.Path(resolve_path=True, readable=True, exists=True,
                                file_okay=True))
@click.argument('output_dir', required=True,
                type=click.Path(resolve_path=True, readable=True, exists=False,
                                file_okay=True))
@click.option('--ref-fp', required=False, multiple=True,
              default=[], show_default=False,
              type=click.Path(resolve_path=True, readable=True, exists=True,
                              file_okay=True),
              help="Keep all sequences aligning to this FASTA database "
                   "(for multiple databases, use "
                   "--ref-fp db1.fa --ref-fp db2.fa ..)\n"
                   "default for positive filtering is: %s\n"
                   "default for negative filtering is: %s" %
                   (pos_db, neg_db))
@click.option('--ref-db-fp', required=False, multiple=True,
              type=click.Path(resolve_path=True, readable=True, exists=False,
                              file_okay=True),
              help="Keep all sequences aligning to this indexed "
                   "database. For multiple databases, the order "
                   "must follow that of --ref-fp, for example, "
                   "--ref-db-fp db1.idx --ref-db-fp db2.idx ..")
@click.option('--negate/--only-pos', '-n/', required=False, default=True,
              is_flag=True, show_default=True,
              help=("Use --negate (i.e. True) to discard all sequences "
                    "aligning to the database passed via --neg-ref-fp option. "
                    "Alternatively, use --only-pos (i.e. False) to filter by "
                    "only keeping sequences similar to ones in --pos-ref-fp."))
@click.option('--threads-per-sample', '-a', required=False, type=int,
              default=1, show_default=True,
              help="Number of threads to use per sample (0 to use all)")
@click.option('--log-level', required=False,
              type=click.IntRange(1, 5, clamp=True), default=2,
              show_default=True, help="Level of messages for log file"
              "(range 1-debug to 5-critical")
@click.option('--log-file', required=False,
              type=click.Path(resolve_path=True, readable=True,
                              exists=False, dir_okay=True),
              default='deblur.log',
              show_default=True, help="log file name")
def remove_artifacts(seqs_fp, output_dir, ref_fp, ref_db_fp,
                     negate, threads_per_sample, log_level, log_file):
    """
    Filter artifacts.

    Filter artifacts from input sequences based on database(s)
    provided with the --ref-fp (raw FASTA file) or
    --ref-db-fp (indexed database) options.
    """
    start_log(level=log_level * 10, filename=log_file)
    logger = logging.getLogger(__name__)
    logger.info('---------------------------------')
    logger.info('remove_artifacts')
    logger.info('number of database files %d' % len(ref_fp))
    # get the default values for the ref_fp if empty
    if len(ref_fp) == 0:
        if negate:
            ref_fp = [neg_db]
            logger.debug('Using default negative filtering file %s' % ref_fp)
        else:
            ref_fp = [pos_db]
            logger.debug('Using default positive filtering file %s' % ref_fp)

    if ref_db_fp:
        if len(ref_fp) != len(ref_db_fp):
            logger.critical("The number of FASTA reference databases "
                            "does not match the number of indexed "
                            "reference databases")
            raise ValueError("The number of FASTA reference databases "
                             "does not match the number of indexed "
                             "reference databases")
    else:
        logger.warn('sortmerna database does not exist, '
                    'creating it into dir %s' % output_dir)
        ref_db_fp = build_index_sortmerna(
            ref_fp=ref_fp,
            working_dir=output_dir)

    remove_artifacts_seqs(seqs_fp=seqs_fp,
                          ref_fp=ref_fp,
                          working_dir=output_dir,
                          ref_db_fp=ref_db_fp,
                          negate=negate,
                          threads=threads_per_sample)


# INDEX ARTIFACT DB COMMAND
@deblur_cmds.command()
@click.argument('ref_fp', required=True,
                type=click.Path(resolve_path=True, readable=True, exists=False,
                                file_okay=True))
@click.argument('output_dir', required=True,
                type=click.Path(resolve_path=True, readable=True, exists=False,
                                file_okay=True))
@click.option('--log-level', required=False,
              type=click.IntRange(1, 5, clamp=True), default=2,
              show_default=True, help="Level of messages for log file"
              "(range 1-debug to 5-critical")
@click.option('--log-file', required=False,
              type=click.Path(resolve_path=True, readable=True,
                              exists=False, dir_okay=True),
              default='deblur.log',
              show_default=True, help="log file name")
def build_db_index(ref_fp, output_dir, log_level, log_file):
    """
    Preapare the artifacts database

    Input:
    ref_fp - the fasta sequence database for artifact removal
    output_dir - the directory to where to write the indexed database
    """
    start_log(level=log_level * 10, filename=log_file)
    logger = logging.getLogger(__name__)
    logger.info('---------------------------------')
    logger.info('build_db_index')
    logger.info('for database file %s, into dir %s' % (ref_fp, output_dir))
    build_index_sortmerna(ref_fp=[ref_fp], working_dir=output_dir)
    logger.info('done build_db_index')


# MULTIPLE SEQUENCE ALIGNMENT COMMAND
@deblur_cmds.command()
@click.argument('seqs_fp', required=True,
                type=click.Path(resolve_path=True, readable=True, exists=True,
                                file_okay=True))
@click.argument('output_fp', required=True,
                type=click.Path(resolve_path=True, readable=True, exists=False,
                                file_okay=True))
@click.option('--threads-per-sample', '-a', required=False, type=int,
              default=1, show_default=True,
              help="Number of threads to use per sample (0 to use all)")
@click.option('--log-level', required=False,
              type=click.IntRange(1, 5, clamp=True), default=2,
              show_default=True, help="Level of messages for log file"
              "(range 1-debug to 5-critical")
@click.option('--log-file', required=False,
              type=click.Path(resolve_path=True, readable=True,
                              exists=False, dir_okay=True),
              default='deblur.log',
              show_default=True, help="log file name")
def multiple_seq_alignment(seqs_fp, output_fp, threads_per_sample,
                           log_level, log_file):
    """Multiple sequence alignment"""
    start_log(level=log_level * 10, filename=log_file)
    alignment = multiple_sequence_alignment(seqs_fp,
                                            threads=threads_per_sample)

    with open(output_fp, 'w') as f:
        f.write(alignment.to_fasta())


# DE NOVO CHIMERA REMOVAL COMMAND
@deblur_cmds.command()
@click.argument('seqs_fp', required=True,
                type=click.Path(resolve_path=True, readable=True, exists=True,
                                file_okay=True))
@click.argument('output_fp', required=True,
                type=click.Path(resolve_path=True, readable=True,
                                exists=False, file_okay=True))
@click.option('--log-level', required=False,
              type=click.IntRange(1, 5, clamp=True), default=2,
              show_default=True, help="Level of messages for log file"
              "(range 1-debug to 5-critical")
@click.option('--log-file', required=False,
              type=click.Path(resolve_path=True, readable=True,
                              exists=False, dir_okay=True),
              default='deblur.log',
              show_default=True, help="log file name")
def remove_chimeras_denovo(seqs_fp, output_fp, log_level, log_file):
    """Remove chimeras de novo using UCHIME (VSEARCH implementation)"""
    start_log(level=log_level * 10, filename=log_file)
    remove_chimeras_denovo_from_seqs(seqs_fp, output_fp)


# GENERATE BIOM TABLE COMMAND
@deblur_cmds.command()
@click.argument('seqs_fp', required=True,
                type=click.Path(resolve_path=True, readable=True, exists=True,
                                file_okay=True))
@click.argument('output_biom_fp', required=True,
                type=click.Path(resolve_path=True, readable=True,
                                exists=False, file_okay=True))
@click.option('--min-reads', required=False, type=int, default=10,
              show_default=True, help="In output biom table - keep only "
              "sequences appearing at least min-reads in all samples "
              "combined.")
@click.option('--file_type', required=False, type=str,
              default='.fasta.trim.derep.no_artifacts.msa.deblur.no_chimeras',
              show_default=True,
              help='ending of files to be added to the biom table')
@click.option('--log-level', required=False,
              type=click.IntRange(1, 5, clamp=True), default=2,
              show_default=True, help="Level of messages for log file"
              "(range 1-debug to 5-critical")
@click.option('--log-file', required=False,
              type=click.Path(resolve_path=True, readable=True,
                              exists=False, dir_okay=True),
              default='deblur.log',
              show_default=True, help="log file name")
def build_biom_table(seqs_fp, output_biom_fp, min_reads, file_type, log_level,
                     log_file):
    """Generate a BIOM table from a directory of chimera removed fasta files
    Parameters
    ----------
    seqs_fp : str
      the path to the directory containing the chimera removed fasta files
    output_biom_fp : str
      the path where to save the output biom table files
      ('all.biom', 'reference-hit.biom', 'reference-non-hit.biom')
    file_type : str
      the files type to add to the table
      (default='.trim.derep.no_artifacts.msa.deblur.no_chimeras',
      can be '.fasta' or '.fa' if needed)
    """
    start_log(level=log_level * 10, filename=log_file)
    output_filename = 'all.biom'
    output_fp = join(output_biom_fp, output_filename)
    outputfasta_filename = 'all.seq.fa'
    outputfasta_fp = join(output_biom_fp, outputfasta_filename)

    samples = get_files_for_table(seqs_fp, file_type)
    create_otu_table(output_fp, samples,
                     outputfasta_fp=outputfasta_fp, minreads=min_reads)


# LAUNCH FULL DEBLUR PIPELINE COMMAND
@deblur_cmds.command()
@click.option('--seqs-fp', required=True,
              type=click.Path(resolve_path=True, readable=True, exists=True,
                              file_okay=True, dir_okay=True),
              help=("Either a Demultiplexed FASTA or FASTQ file including all "
                    "samples, or a directory of per-sample FASTA or FASTQ "
                    "files. Gzip'd files are acceptable (e.g., .fastq.gz)."))
@click.option('--output-dir', required=True,
              type=click.Path(resolve_path=True, readable=True, exists=False,
                              dir_okay=True),
              help="Directory path to store output including BIOM table")
@click.option('--trim-length', '-t', required=True, type=int,
              show_default=True,
              help=("Sequence trim length. All reads shorter than the "
                    "trim-length will be discarded. A value of -1 can be "
                    "specified to skip trimming; this assumes all sequences "
                    "have an identical length."))
@click.option('--left-trim-length', required=False, type=int,
              show_default=True, default=0,
              help=("Trim the first N bases from every sequence. A value of 0 "
                    "disables this trim."))
@click.option('--pos-ref-fp', required=False, multiple=True,
              default=[], show_default=False,
              type=click.Path(resolve_path=True, readable=True, exists=True,
                              file_okay=True),
              help=("Positive reference filtering database. "
                    "Keep all sequences permissively aligning to any sequence "
                    "in this FASTA file; these results are stored in the "
                    "reference-hit.biom output file. This defaults to the "
                    "Greengenes 13_8 OTUs at 88% identity. An e-value "
                    "threshold of 10 is used with SortMeRNA against the "
                    "reference. For multiple databases, specify the argument "
                    "multiple times, e.g., --pos-ref-fp db1.fa "
                    "--pos-ref-fp db2.fa"))
@click.option('--pos-ref-db-fp', required=False, multiple=True,
              type=click.Path(resolve_path=True, readable=True, exists=False,
                              file_okay=True),
              help=("An indexed version of the positive filtering database. "
                    "This can be useful to avoid incurring the expense of "
                    "reindexing on every run. If not supplied, deblur will "
                    "index the database. For multiple databases, the order "
                    "must follow that of --pos-ref-fp, for example, "
                    "--pos-ref-db-fp db1.idx --pos-ref-db-fp db2.idx .."))
@click.option('--neg-ref-fp', required=False, multiple=True,
              default=[], show_default=False,
              type=click.Path(resolve_path=True, readable=True, exists=True,
                              file_okay=True),
              help=("Negative (artifacts) filtering database. "
                    "Drop all sequences which align to any record in this "
                    "FASTA file. This defaults to a database composed of "
                    "multiple PhiX genomes and known Illumina adapters. "
                    "For multiple databases, specify the argument mutiple "
                    "times, e.g., --neg-ref-fp db1.fa --neg-ref-fp db2.fa"))
@click.option('--neg-ref-db-fp', required=False, multiple=True,
              type=click.Path(resolve_path=True, readable=True, exists=False,
                              file_okay=True),
              help="An indexed version of the negative filtering database. "
                   "If not supplied, deblur will index the database."
                   "For multiple databases, the order "
                   "must follow that of --neg-ref-fp, for example, "
                   "--neg-ref-db-fp db1.idx --neg-ref-db-fp db2.idx ..")
@click.option('--overwrite', '-w', required=False, type=bool, default=False,
              is_flag=True,
              show_default=True, help="Overwrite output directory if exists.")
@click.option('--mean-error', '-m', required=False, type=float, default=0.005,
              show_default=True,
              help=("The mean per nucleotide error rate, used for original "
                    "sequence estimate. If not passed typical illumina error "
                    "rate is used."))
@click.option('--error-dist', '-d', required=False, type=str,
              default=get_default_error_profile(),
              show_default=True,
              callback=error_dist_from_str,
              help="A comma separated list of error probabilities for each "
                   "Hamming distance. The length of the list determines the "
                   "number of Hamming distances taken into account.")
@click.option('--indel-prob', '-i', required=False, type=float, default=0.01,
              show_default=True,
              help=('Insertion/deletion (indel) probability. This probability '
                    'consistent for multiple indels; there is not an added '
                    'elongation penalty.'))
@click.option('--indel-max', required=False, type=int, default=3,
              show_default=True,
              help="Maximum number of allowed indels.")
@click.option('--min-reads', required=False, type=int, default=10,
              show_default=True,
              help=("Keep only the sequences which appear at least "
                    "min-reads study wide (as opposed to per-sample)."))
@click.option('--min-size', required=False, type=int, default=2,
              show_default=True,
              help=("Keep only sequences which appear at least min-size times "
                    "per-sample (as opposed to study wide)."))
@click.option('--threads-per-sample', '-a', required=False, type=int,
              default=1, show_default=True,
              help="Number of threads to use per sample (0 to use all)")
@click.option('--keep-tmp-files', required=False, type=bool, default=False,
              is_flag=True,
              show_default=True,
              help="Keep temporary files (useful for debugging)")
@click.option('--log-level', required=False,
              type=click.IntRange(1, 5, clamp=True), default=2,
              show_default=True, help="Level of messages for log file"
              "(range 1-debug to 5-critical")
@click.option('--log-file', required=False,
              type=click.Path(resolve_path=True, readable=True,
                              exists=False, dir_okay=True),
              default='deblur.log',
              show_default=True, help="log file name")
@click.option('--jobs-to-start', '-O', required=False, type=int, default=1,
              show_default=True,
              help="Number of jobs to start (if to run in parallel)")
@click.option('--is-worker-thread', required=False, type=bool, default=False,
              is_flag=True,
              show_default=True,
              help=("indicates this is not the main deblur process "
                    "(used by the parallel deblur - do not use manually)"))
def workflow(seqs_fp, output_dir, pos_ref_fp, pos_ref_db_fp,
             neg_ref_fp, neg_ref_db_fp, overwrite,
             mean_error, error_dist, indel_prob, indel_max,
             trim_length, left_trim_length, min_reads, min_size,
             threads_per_sample, keep_tmp_files, log_level, log_file,
             jobs_to_start, is_worker_thread):
    """Launch deblur workflow"""
    start_log(level=log_level * 10, filename=log_file)
    logger = logging.getLogger(__name__)
    logger.warn('deblur version %s workflow started on %s' %
                (__version__, seqs_fp))
    logger.warn('parameters: %s' % locals())

    # set the error distribution
    if error_dist is None:
        logger.info('using default error profile')
        # set to the default illumina read error profile
        error_dist = get_default_error_profile()

    logger.info('error_dist is : %s' % error_dist)

    # Store the name of the split dir if we create it,
    # None if we didn't create a split dir
    # (used for deleting it at the end)
    created_split_dir = None

    # get the default values for the rep_fp if empty
    if len(pos_ref_fp) == 0:
        pos_ref_fp = [pos_db]
        logger.debug('Using default positive filtering file %s' % pos_ref_fp)
    if len(neg_ref_fp) == 0:
        neg_ref_fp = [neg_db]
        logger.debug('Using default negative filtering file %s' % neg_ref_fp)

    # Create output directory
    if not is_worker_thread:
        logger.info('deblur main program started')
        if exists(output_dir):
            if overwrite:
                logger.debug('overwrite is on - deleting directory %s' %
                             output_dir)
                rmtree(output_dir)
            else:
                logger.critical('output directory %s already exists' %
                                output_dir)
                raise OSError("Output directory already exists. Choose a "
                              "different directory or use option "
                              "--overwrite (-w)")
        working_dir = join(output_dir, "deblur_working_dir")
        makedirs(working_dir)
    else:
        logger.info('deblur worker thread started')
        working_dir = join(output_dir, "deblur_working_dir")

    # if it's a worker thread, the seqs_fp parameter
    # is the single fasta file name
    if is_worker_thread:
        input_file_list = [seqs_fp]
    else:
        # if seqs_fp is a dir, use the fasta files in it,
        # otherwise, assume its a post split-libraries fasta file
        # and split it to fasta file per sample
        if isdir(seqs_fp):
            logger.info('processing directory %s' % seqs_fp)
            out_dir_split = seqs_fp
        else:
            # Split demultiplexed FASTA on sample IDs
            out_dir_split = join(output_dir, "split")
            logger.info('splitting file %s to per sample fasta. '
                        'output %s' % (seqs_fp, out_dir_split))
            mkdir(out_dir_split)
            created_split_dir = out_dir_split
            with open(seqs_fp, 'U') as seqs_f:
                split_sequence_file_on_sample_ids_to_files(
                    seqs_f,
                    out_dir_split)

        patterns = ['*.fast[aq]', '*.fast[aq].gz', '*.fna', '*.fq', '*.fna.gz',
                    '*.fq.gz']
        input_file_list = []
        for pattern in patterns:
            input_file_list.extend(glob('%s/%s' % (out_dir_split, pattern)))

    files_to_remove = []
    # Build SortMeRNA indexes
    if not neg_ref_db_fp:
        logger.info('building negative db sortmerna index files')
        neg_ref_db_fp = build_index_sortmerna(ref_fp=neg_ref_fp,
                                              working_dir=working_dir)
    if not pos_ref_db_fp:
        logger.info('building positive db sortmerna index files')
        pos_ref_db_fp = build_index_sortmerna(ref_fp=pos_ref_fp,
                                              working_dir=working_dir)

    ref_db_fp = neg_ref_db_fp
    ref_fp = neg_ref_fp

    # Deblur each sample fasta file
    if jobs_to_start < 2:
        # run in serial mode (no parallelization)
        logger.info('processing per sample fasta files')
        for input_fp in input_file_list:
            if isfile(input_fp):
                deblurred_file_name = launch_workflow(
                    seqs_fp=input_fp, working_dir=working_dir,
                    mean_error=mean_error,
                    error_dist=error_dist, indel_prob=indel_prob,
                    indel_max=indel_max, trim_length=trim_length,
                    left_trim_length=left_trim_length,
                    min_size=min_size, ref_fp=ref_fp, ref_db_fp=ref_db_fp,
                    threads_per_sample=threads_per_sample)
                if deblurred_file_name is None:
                    logger.warn('deblurring failed for file %s' % input_fp)
        logger.info('finished processing per sample fasta files')
    else:
        # we're want to start parallel mode - so need to create
        # the thread shell commands
        logger.info('parallel processing per sample fasta files')
        parallel_deblur(input_file_list, sys.argv, pos_ref_db_fp,
                        neg_ref_db_fp, jobs_to_start)

    if is_worker_thread:
        logger.info('worker thread for %s finished' % seqs_fp)
        return

    # Merge chimera cleaned fasta files to a single biom table
    output_filename = 'all.biom'
    output_fp = join(output_dir, output_filename)
    outputfasta_filename = 'all.seqs.fa'
    outputfasta_fp = join(output_dir, outputfasta_filename)

    samples = get_files_for_table(working_dir)
    create_otu_table(output_fp, samples,
                     outputfasta_fp=outputfasta_fp, minreads=min_reads)

    # also create biom tables with
    # only sequences matching the pos_ref_fp sequences (reference-hit.biom)
    # and only sequences not matching the pos_ref_fp sequences
    # (reference-non-hit.biom)
    tmp_files = remove_artifacts_from_biom_table(output_fp, outputfasta_fp,
                                                 pos_ref_fp, output_dir,
                                                 pos_ref_db_fp,
                                                 threads=threads_per_sample)
    # add the temporary files for the filtering to the remove list
    files_to_remove.extend(tmp_files)

    # Clean up
    if not keep_tmp_files:
        logger.info('Cleaning up temp files')
        for f in files_to_remove:
            remove(f)
        rmtree(working_dir)
        if created_split_dir is not None:
            rmtree(created_split_dir)
    else:
        logger.info('Keeping temp files')
    logger.info('deblur workflow finished')
    logger.info('output saved to %s' % output_fp)
    logger.info('------------------')


if __name__ == '__main__':
    deblur_cmds()
