#
# This is a setup script for "Building Your Own BLAST Database"
# (http://www.cs.umt.edu/~dougr/index_files/bioinfBuildingBLASTdb.htm).
#
# Please read through the code before executing it as
#    perl setup.pl
#
# Author: Jon-Michael Deldin <jon-michael.deldin@umontana.edu>
# Date:   2011-09-18
#

use strict;
use warnings;

#
# configuration
#

# Directory (relative to where the script is started) that receives every
# downloaded file and the generated database.
my $save_directory = 'data';

# Name of the combined FASTA file that BLAST will index and query.
my $db_file = 'anthraxDB.fna';

# Genome sequences to fetch; each URL is handed to the download tool as-is.
my @to_download = (
    'ftp://ftp.ncbi.nih.gov/genomes/Bacteria/Bacillus_anthracis_A0248_uid59385/NC_012659.fna',
    'ftp://ftp.ncbi.nih.gov/genomes/Bacteria/Bacillus_anthracis_Ames_uid57909/NC_003997.fna',
    'ftp://ftp.ncbi.nih.gov/genomes/Bacteria/Bacillus_anthracis_CDC_684_uid59303/NC_012581.fna',
    'ftp://ftp.ncbi.nih.gov/genomes/Bacteria/Bacillus_anthracis_Sterne_uid58091/NC_005945.fna',
);

# Query files to fetch alongside the database.
my @query_files = (
    'http://www.cs.umt.edu/~dougr/bioinfWebFiles/Projects/buildOwnBlastDB/queryNuc.txt',
    'http://www.cs.umt.edu/~dougr/bioinfWebFiles/Projects/buildOwnBlastDB/query.txt',
);

#
# download
#
my $os   = $^O;  # special perl variable for the operating system
my $tool = '';   # program to download sequences with

# Pick a downloader based on the operating system: curl on OS X (darwin),
# wget on Linux.
if ($os eq 'darwin') {
    # -f   exit non-zero on HTTP errors instead of saving the error page
    # -sS  no progress meter, but still print error messages
    # -L   follow redirects (wget does this by default)
    # -O   save the file under its remote name
    $tool = 'curl -fsSLO';
} elsif ($os eq 'linux') {
    $tool = 'wget';
} else {
    die "ERROR: This can only run on OS X or Linux\n";
}

# create the data directory
safe_exec(
    "mkdir -p $save_directory",
    "Created $save_directory/",
    "create $save_directory/"
);

# cd into it; stop here on failure so that nothing is downloaded into the
# wrong directory
chdir $save_directory
    or die "ERROR: Couldn't cd into $save_directory/: $!\n";

# download each file
for my $f (@to_download) {
    safe_exec("$tool $f", "Downloaded $f", "download $f");
}

#
# make the database (one giant FNA file)
#
# We can do this from the command line with
#  cat a.fna b.fna c.fna > anthraxDB.fna
# or
#  cat *.fna > anthraxDB.fna
#
# To do this in Perl, we need to build a string like "a.fna b.fna
# c.fna" with whatever FNA files are in the current directory. The
# glob() function does just that.
#
# The database file itself also ends in .fna, so when the script is run a
# second time glob() would return it as well. It is filtered out so that cat
# is never asked to read the file it is writing.
my @sequence_files = grep { $_ ne $db_file } glob("*.fna");

# With no input files, cat would wait for input on STDIN instead of failing.
die "ERROR: Couldn't create $db_file (no .fna files found)\n"
    unless @sequence_files;

my $fna_files = join(" ", @sequence_files);
safe_exec("cat $fna_files > $db_file", "Created $db_file", "create $db_file");

#
# get the query files
#
# Fetch each query file with the same tool used for the sequences. The
# success message is worded like the one for sequences ("Downloaded ...").
for my $q (@query_files) {
    safe_exec("$tool $q", "Downloaded $q", "download $q");
}

#
# index the database with BLAST
#
# Run makeblastdb on the combined file, declaring it a nucleotide database.
safe_exec(
    "makeblastdb -in $db_file -dbtype nucl",
    "Indexed DB",
    "run makeblastdb"
);

#
# create some stub files (README, src/query_(nuc|prot).pl)
#
# Step back out of the data directory. This lands in the starting directory
# only while $save_directory is a single-level relative path such as 'data'.
# Stop on failure so that the stubs are not created in the wrong place.
chdir '..'
    or die "ERROR: Couldn't cd back to the parent directory: $!\n";

safe_exec("mkdir -p src", "Created src directory", "create src dir");

# touch creates the files when they are missing and leaves the contents of
# existing files alone.
safe_exec(
    "touch README src/query_nuc.pl src/query_prot.pl",
    "Created stub files",
    "create stub files"
);


# Executes a system command and prints a success message, or dies with a
# failure message that also says how the command failed.
#
# Use it like this:
#  safe_exec("curl -O URL", "Downloaded file", "download file");
#
# On failure the example above dies with something like
#  ERROR: Couldn't download file (exit code 6)
#
# The command's standard output is captured by the backticks and thrown
# away; its standard error is not captured.
#
# @param string COMMAND
# @param string SUCCESS_MESSAGE
# @param string FAILURE_MESSAGE This gets prefixed by "Couldn't "
sub safe_exec {
    my ($cmd, $succ, $fail) = @_;

    `$cmd`;

    # $? is the status of the external command (thing in backticks) and $!
    # is the operating system's last error. Copy both right away, before any
    # other call can overwrite them.
    my ($status, $os_error) = ($?, "$!");

    if ($status != 0) {
        # -1 means the command could not be started at all (for example,
        # the program is not installed). Otherwise the low 7 bits hold the
        # signal that killed the command and the high byte its exit code.
        my $reason
            = $status == -1 ? "could not run command: $os_error"
            : $status & 127 ? 'killed by signal ' . ($status & 127)
            :                 'exit code ' . ($status >> 8);

        die "ERROR: Couldn't $fail ($reason)\n";
    }

    # A status of 0 means the command was successful.
    print "$succ\n";
}
