TarHunter Code

#!/usr/bin/perl

#########################################################################
#
#  TarHunter
#
#  Version 2.1
#
#  This program is designed for predicting conserved miRNA target and 
#  target mimics in plants.
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or    
#  (at your option) any later version.
#                                                                
#  This program is distributed in the hope that it will be useful,      
#  but WITHOUT ANY WARRANTY; without even the implied warranty of       
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the       
#  GNU General Public License for more details.
#  
#  Author: Xuan Ma (skyxma@tjnu.edu.cn)
#
#########################################################################

use strict;
use warnings;
use 5.010;
use Getopt::Long qw/:config no_ignore_case/;
use Cwd 'abs_path';
use File::Temp qw/tempfile tempdir/;
use File::Basename;
use IO::File;
use List::Util qw/max min/;
no warnings 'recursion';

my $TarHunter_version = "2.1";
my $fasta_version   = "fasta36";

my $query_mir_list_file = '';
my $species_list_file = '';
my $dbs_folder = '';
my $output_folder = '';

my $program_option = 1;           #default: FASTA
my $mispair_cutoff = 100;         #set loose cutoff
my $seed_mispair_cutoff = 100;    #set loose cutoff
my $mimics_arm_misp_cutoff = 0;   #default: no misp in eTM arms
my $bits_cutoff = 0.1;            #default: 10% of best bitscore
my $nc_len = 100;                 #default: homo_mode nc/CDS 100 bp
my $iden_cutoff = 0.7;            #default: homo_mode identity 70%
my $jobs = 1;                     #default: 1 job
my $threads = 1;                  #default: 1 CPU

my $nonmirbase_file;
my $MUSCLE_alignment_file;
my $score_cutoff;
my $no_orth_mir_search;
my $no_conservation;
my $nc_target;
my $mimics;
my $tab_format;
my $help;


GetOptions (
    "n|non_mirbase=s"  => \$nonmirbase_file,
    "q|qmir=s"         => \$query_mir_list_file,
    "s|spe=s"          => \$species_list_file,
    "b|db=s"           => \$dbs_folder,
    "p|prog=i"         => \$program_option,
    "a|aln=s"          => \$MUSCLE_alignment_file,
    "M|total_misp=i"   => \$mispair_cutoff,
    "m|seed_misp=i"    => \$seed_mispair_cutoff,
    "f|score=f"        => \$score_cutoff,
    "c|iden=f"         => \$iden_cutoff,
    "N|no_conserve"    => \$no_conservation,
    "R|no_ortho_mir"   => \$no_orth_mir_search,
    "G|nc_targ"        => \$nc_target,
    "I|mimics"         => \$mimics,
    "i|mimics_str=i"   => \$mimics_arm_misp_cutoff,
    "L|len=i"          => \$nc_len,
    "j|jobs=i"         => \$jobs,
    "T|threads=i"      => \$threads,
    "t|tab"            => \$tab_format,
    "o|output=s"       => \$output_folder,
    "h|help"           => \$help
);


$score_cutoff = 100 if $mimics;            #eTM: set loose score cutoff
$score_cutoff = 4 if (! $mimics and ! $score_cutoff); #default score: 4


STDERR->say ("\n", "=="x40);
STDERR->say ("TarHunter(version $TarHunter_version)    Plant miRNA target searching tool");
STDERR->say ("=="x40);

my $time = scalar localtime;
STDERR->say ("$time");

my @cmdline = ();
my $buffer;
if (open my $cmd_fh, "<:raw", "/proc/$$/cmdline") {
    read($cmd_fh, $buffer, 8388608); 
    close $cmd_fh;
    @cmdline = split /\0/s, $buffer;
}
STDERR->say ("Command Line: ", join(" ", @cmdline), "\n");

STDERR->say ("-"x6, " Checking arguments and tools ...");

#check arguments
check_arguments();


#check if output folder already exists
if (-e $output_folder) {
    STDERR->say ("Output folder already exists. TarHunter exits.");
    exit;
}


#get TarHunter directory path
my $program_path = abs_path($0);
my $dir = dirname($program_path);


#create temporary folder
system "mkdir $dir/temp" unless -e "$dir/temp";
my $temp_dir = tempdir('TarHunter_job_XXXXXXXXX', DIR => "$dir/temp");
STDERR->say ("Creating temporary folder ($temp_dir).");

system "mkdir $output_folder $temp_dir/cdhit $temp_dir/ortho $temp_dir/cdsnew $temp_dir/pr_db $temp_dir/mimics";

my $usearch_version = "$dir/bin/usearch8.1.1756_i86linux32";
my $muscle_version  = "$dir/bin/muscle3.8.31_i86linux64";
system "chmod a+x $usearch_version" unless -x $usearch_version;
system "chmod a+x $muscle_version" unless -x $muscle_version;

#check tools USEARCH, MUSCLE, MCL, FASTA and RNAhybrid
check_tools();



####################  Reading miRNAs  ####################

#read mir list and species list
STDERR->say ("\n", "-"x6, " Reading miRNA file ...");

if ($nonmirbase_file) {
    system "cat $nonmirbase_file $dir/miRs/miRBase.fa > $temp_dir/cdhit/miRBase.fa";
}

my $mir_list_href = read_list($query_mir_list_file);
my $species_list_href = read_list($species_list_file);
my $num_of_species = scalar (keys %{ $species_list_href});


#produce new miR list and new miR cluster file
if ($nonmirbase_file) {
    system "cd-hit-est -i $temp_dir/cdhit/miRBase.fa -o $temp_dir/cdhit/out -c 0.85 -n 5 -d 40 -g 1 >>$temp_dir/msg.txt 2>&1";
    cdhit_parser();
}

my %mir_newlist = %{$mir_list_href};
orth_mir() unless $num_of_species == 1;


#read mirBase sequence file
my $miRBase;
if ($nonmirbase_file) {$miRBase = "$temp_dir/cdhit/miRBase.fa";}
else                  {$miRBase = "$dir/miRs/miRBase.fa";}
my $mir_seq_href = read_seq_file($miRBase);


#get sequences of new miR list
my %mir_newseq;
for my $id (keys %{$mir_seq_href}) {
    if (exists $mir_newlist{$id}) {
        $mir_newseq{$id} = $mir_seq_href->{$id};
    }
}

if (! %mir_newseq) {
    STDERR->say ( "No miRBase miRNAs. TarHunter exits." );
    exit;
}
undef $mir_seq_href;


 
####################  Orthologous gene clustering  ####################

my ($MUSCLE_aln_href, $all_proteomes_href);
unless (defined $no_conservation or $num_of_species == 1 or
        defined $mimics or defined $nc_target or defined $MUSCLE_alignment_file) {
        
    STDERR->say ("\n", "-"x6, " Orthologous gene clustering ...");
    
    #convert cds dbs to protein dbs
    $all_proteomes_href = generate_pr_dbs();

    #run UBLAST, MCL, MUSCLE
    my $gene_group_href;
    $gene_group_href = run_UBLAST_MCL();
    run_MUSCLE($gene_group_href);
}



####################  miRNA target search  ####################

if ($program_option == 0) {
    STDERR->say ("\n", "-"x6, " TarHunter analysis completed.");
    exit;
}

STDERR->say ("\n", "-"x6, " miRNA target search ...");

#read muscle alignment file
if ( defined $MUSCLE_alignment_file ) {
    $MUSCLE_aln_href = read_seq_file($MUSCLE_alignment_file);
}


#run FASTA and RNAhybrid for each miR
my ($rnahy_output_ID, $fasta_output_ID) = (0, 0);

for my $query_mir (sort keys %mir_newseq) {
    next unless $query_mir =~ /^(...)\-/;
    my $cds_file;
    my @db_files = glob "$dbs_folder/*.fa";
    next unless ($cds_file) = grep { /$1\.fa/ } @db_files;
    $cds_file = basename $cds_file;

    my $mir_tmpfile = IO::File->new("$temp_dir/mir.fa.tmp", 'w') or die $!;
    $mir_tmpfile->say (">$query_mir\n$mir_newseq{$query_mir}");
    $mir_tmpfile->close;

    if ($program_option == 1) {
        run_FASTA($query_mir, "$temp_dir/mir.fa.tmp", "$dbs_folder/$cds_file");
        
    } elsif ($program_option == 2) {
        unless ( -e "$temp_dir/cdsnew/$cds_file\.cdsnew" ) {

            #read cds db
            my $cds_file_href =  read_seq_file("$dbs_folder/$cds_file");
                
            #creat new cds db for RNAhybrid
            STDERR->say ("Producing new $cds_file file for RNAhybrid...");
            my $cdsnew_href = cds_new($cds_file_href);

            my $cdsnew_file = IO::File->new("$temp_dir/cdsnew/$cds_file\.cdsnew", 'w') or die $!;
            foreach (keys %{$cdsnew_href}) {
                $cdsnew_file->say (">$_\n$cdsnew_href->{$_}");
            }
        }
        run_RNAhybrid($query_mir, "$temp_dir/mir.fa.tmp", "$temp_dir/cdsnew/$cds_file\.cdsnew");
    }
}



####################  Conservation filter  ####################

if ($no_conservation or $num_of_species == 1) {
    STDERR->say ("\n", "-"x6, " TarHunter analysis completed."); 
    exit;
}

if ($mimics or $nc_target) { # nc/mimics target conservation
    STDERR->say ("\n", "-"x6, " Finding cross-species conserved target sites ...");
    mimics_nctarg_conservation();
    
} else { # CDS target conservation
    STDERR->say ("\n", "-"x6, " Finding cross-species conserved CDS target sites ...");
    system "sort -k 2,2 $temp_dir/mircluster.tmp > $temp_dir/mircluster.sorted.tmp";

    my $program;
    if ($program_option == 1)    { $program = 'fasta'; }
    elsif ($program_option == 2) { $program = 'rnahy'; }

    conservation("$temp_dir/$program\_out", "$temp_dir/$program\_conserv_out");
    undef $MUSCLE_aln_href;

    if ($tab_format) {
        system "head -n 1 $temp_dir/$program\_conserv_out >> $output_folder/$program\_out_conserv";
        system "tail -n +2 $temp_dir/$program\_conserv_out | sort -u >> $output_folder/$program\_out_conserv";
    } else {
        remove_duplicate("$temp_dir/$program\_conserv_out", "$output_folder/$program\_out_conserv");
    }
}

system "rm -r $temp_dir/cdhit $temp_dir/ortho $temp_dir/cdsnew $temp_dir/pr_db $temp_dir/mimics";
system "rm $temp_dir/*.tmp";
system "rm $temp_dir/*_out" if -e "$temp_dir/*_out";

STDERR->say ("\n", "-"x6, " TarHunter analysis completed.\n");



####################  Subroutines  ####################

sub check_arguments {
    unless ($query_mir_list_file) {
        STDERR->say ("FATAL: query miR list file does not exist.");
        usage();
    }
    unless ($dbs_folder) {
        STDERR->say ("FATAL: dbs folder is not provided.");
        usage();
    }
    unless ($species_list_file) {
        STDERR->say ("FATAL: species list file does not exist.");
        usage();
    }
    unless ($output_folder) {
        STDERR->say ("FATAL: output folder is not provided.");
        usage();
    }
    if ($MUSCLE_alignment_file and !(-e $MUSCLE_alignment_file)) {
        STDERR->say ("FATAL: protein alignment file does not exists.");
        usage();
    }
    if ($nonmirbase_file and !(-e $nonmirbase_file)) {
        STDERR->say ("FATAL: non-miRBase miRNA file does not exists.");
        usage();
    }
    if ($help) { usage(); }
}


sub usage {
my $usage = << "USAGE";

Usage:  
    perl TarHunter.pl -q mir_list -s spe_list -b dbs_dir -o output_dir [Options]

Required arguments:
    -q (--qmir):         miRNA list
    -s (--spe):          species list
    -b (--db):           dbs directory
    -o (--output):       output directory

Options:
    -p (--prog):         programs
                         (1:FASTA, 2:RNAhybrid, 0:neither) [Default:  1 ]
    -a (--aln):          protein alignment file            [Default: off]
    -n (--nonmirbase):   non-miRBase miRNA file            [Default: off]  

    -M (--total_misp):   max. total mispairs               [Default: off]
    -m (--seed_misp):    max. seed mispairs                [Default: off]
    -f (--score):        score cutoff                      [Default:  4 ]

    -G (--nc_targ):      homo_mode conservation filter     [Default: off]
    -c (--iden):         homo_mode target site identity    [Default: 0.7]
    -L (--len):          homo_mode target site length      [Default: 100]

    -I (--mimics):       eTM search                        [Default: off]
    -i (--mimics_str):   eTM stringency
                         (0: strict, 1: relaxed)           [Default:  0 ]

    -N (--no_conserve):  no ortho_mode conservaton filter  [Default: off]
    -R (--no_ortho_mir): no orthologous miRNA search       [Default: off]

    -j (--jobs):         jobs                              [Default:  1 ]
    -T (--threads):      threads                           [Default:  1 ]
    -t (--tab):          tabular format output             [Default: off]
    -h (--help):         help information

USAGE

    print $usage;
    exit;
}



sub check_tools {
    my $CDHIT_command = qx(which cd-hit-est 2>>$temp_dir/msg.txt);
    unless($CDHIT_command) {
        STDERR->say ("Warning: cd-hit-est not found.");
    }
    
    my $MCL_command = qx(which mcl 2>>$temp_dir/msg.txt);
    unless($MCL_command ) {
        STDERR->say ("Warning: MCL not found.");
    }
    
    my $PARALLEL_command = qx(which parallel 2>>$temp_dir/msg.txt);
    unless($PARALLEL_command ) {
        STDERR->say ("Warning: GNU parallel not found.");
    }
    
    my $RNAhybrid_command = qx(which RNAhybrid 2>>$temp_dir/msg.txt);
    unless ($RNAhybrid_command) {
        STDERR->say ("Warning: RNAhybrid not found.");
    }
    
    my $FASTA_command = qx(which $fasta_version 2>>$temp_dir/msg.txt);
    unless ($FASTA_command) {
        STDERR->say ("Warning: $fasta_version not found.");
    }
}


sub read_list {
    my $file = shift;
    my %list;
    my $infile = IO::File->new( $file, 'r' ) or die $!;
    while (<$infile>) {
        chomp; 
        s/\r//;
        s/^\s*|\s*$//;
        next if /^\s*$/;
        $list{$_} = 1;
    }
    return \%list;
}


sub read_seq_file {
    my ($seqfile) = @_;
    my %seqhash;

    local $/ = '>';
    my $infile = IO::File->new($seqfile, 'r') or die $!;
    while (<$infile>) {
        s/\r//g;
        chomp;
        my ($id, $seq) = /(\S+)(?:.*?)\n(.+)/s or next;
        ($seq = uc $seq) =~ s/\n//g;
        $seqhash{$id} = $seq;
    }
    return \%seqhash;
}


sub orth_mir {
    my $mir_group_href;
    if ($nonmirbase_file) { $mir_group_href = ortho_group("$temp_dir/cdhit/mir_cluster.txt") }
    else                  { $mir_group_href = ortho_group("$dir/miRs/mir_cluster.txt") }
    
    #output mircluster.tmp file for conservation analysis
    my $mircluster_tmpfile = IO::File->new("$temp_dir/mircluster.tmp", 'w') or die $!;
    my %all_spe;
    MIR:for my $mir (keys %{$mir_list_href}) {
        my %passed_spe;
        next unless $mir =~ /^(...)-miR/;
        $passed_spe{$1} = 1;
        $all_spe{$1} = 1;
        
        #find group ID of query mir 
        for my $group (keys %{$mir_group_href}) {
            next unless exists $mir_group_href->{$group}{$mir};
            $mircluster_tmpfile->say ("$group\t$mir");
            
            #without orthologous mir search, only output mircluster.tmp file
            next MIR if defined $no_orth_mir_search;
            
            #find the most orthologous mir from the file mir_cluster.txt
            local $/ = '#';
            my $mircluster_file;
            if ($nonmirbase_file) { $mircluster_file = IO::File->new("$temp_dir/cdhit/mir_cluster.txt", 'r') or die $! }
            else                  { $mircluster_file = IO::File->new("$dir/miRs/mir_cluster.txt", 'r') or die $! }
    
            my $omitted = <$mircluster_file>;
            while (<$mircluster_file>) {
                chomp; 
                s/\r//g;
                next unless /^$group\n/;
                my @mir_arr = split/\n/, $_;
            
                for my $mir_member (@mir_arr) {
                    next unless $mir_member =~ /^(...)-miR/;
                    next if exists $passed_spe{$1};
                    next unless exists $species_list_href->{$1};
                    $mir_newlist{$mir_member} = 1;
                    $mircluster_tmpfile->say ("$group\t$mir_member");
                    $passed_spe{$1} = 1;
                }
            }
        }
    }
    if (! %all_spe) { STDERR->say("No miRBase miRNAs found. TarHunter exits."); exit; }
    $num_of_species == 1 if scalar keys %all_spe == 1;
}


sub generate_pr_dbs {
    my %all_proteomes;
    for my $species (keys %{$species_list_href}) {
        my $cds_file;
        my %proteome;
        my @db_files = glob "$dbs_folder/*.fa";
        next unless ($cds_file) = grep { /$species\.fa/ } @db_files;
    
        (my $pr_file = basename $cds_file) =~ s/\.fa/\.pr\.fa/;
    
        #read cds db
        my $cds_href = read_seq_file($cds_file);
    
        #translate cds db
        my $pr_outfile = IO::File->new("$temp_dir/pr_db/$pr_file", 'w') or die $!;
        for my $id (keys %{$cds_href}) {
            $proteome{$id} = translation($cds_href->{$id});
            $pr_outfile->say (">$id\n$proteome{$id}");
        }
        $pr_outfile->close;
    
        #merge all pr dbs
        STDERR->say ("Converting $species cds dbs to protein dbs ...");
        @all_proteomes{keys %proteome} = values %proteome;
    }
    return \%all_proteomes;
}


sub cdhit_parser {
    my ($k, $m, $v);
    my %h;

    open my $in_fh, '<', "$temp_dir/cdhit/out.clstr" or die $!;
    while(<$in_fh>){
        chomp;
        if (/>Cluster/) {
            s/>/#/; s/\s+//;
            $k = $_;
        } else {
            next unless />(...-miR.+)\.\.\..*(\/.+%|\*)$/;
            $m = $1;
            $v = $2 =~ s/%|\///gr;
            $v = 0 if $v eq '*';
            push @{ $h{$k} }, [$m, $v];
        }
    }
    close $in_fh;

    open my $out, '>', "$temp_dir/cdhit/mir_cluster.txt" or die $!;
    for my $i (sort keys %h) {
        print $out "$i\n";
        for my $j ( @{ $h{$i} } ) {
            next unless $j->[1] == 0;
            print $out "$j->[0]\n";
            last;
        }
        for my $l ( sort {$b->[1] <=> $a->[1]} @{ $h{$i} } ) {
            next if $l->[1] == 0;
            print $out "$l->[0]\n";
        }
    }
    close $out;
}


sub translation {
    my $DNA = shift;
    $DNA = uc $DNA;

    my %genetic_code = (
    'ATA' => 'I',    'ATC' => 'I',    'ATT' => 'I',    'ATG' => 'M',    'ACA' => 'T',
    'ACC' => 'T',    'ACG' => 'T',    'ACT' => 'T',    'AAC' => 'N',    'AAT' => 'N',
    'AAA' => 'K',    'AAG' => 'K',    'AGC' => 'S',    'AGT' => 'S',    'AGA' => 'R',
    'AGG' => 'R',    'TCA' => 'S',    'TCC' => 'S',    'TCG' => 'S',    'TCT' => 'S',
    'TTC' => 'F',    'TTT' => 'F',    'TTA' => 'L',    'TTG' => 'L',    'TAC' => 'Y',
    'TAT' => 'Y',    'TAA' => '*',    'TAG' => '*',    'TGC' => 'C',    'TGT' => 'C',
    'TGA' => '*',    'TGG' => 'W',    'CTA' => 'L',    'CTC' => 'L',    'CTG' => 'L',
    'CTT' => 'L',    'CCA' => 'P',    'CCC' => 'P',    'CCG' => 'P',    'CCT' => 'P',
    'CAC' => 'H',    'CAT' => 'H',    'CAA' => 'Q',    'CAG' => 'Q',    'CGA' => 'R',
    'CGC' => 'R',    'CGG' => 'R',    'CGT' => 'R',    'GTA' => 'V',    'GTC' => 'V',
    'GTG' => 'V',    'GTT' => 'V',    'GCA' => 'A',    'GCC' => 'A',    'GCG' => 'A',
    'GCT' => 'A',    'GAC' => 'D',    'GAT' => 'D',    'GAA' => 'E',    'GAG' => 'E',
    'GGA' => 'G',    'GGC' => 'G',    'GGG' => 'G',    'GGT' => 'G',
    );

    my $protein;
    my $i = 0;
    while ( $i <= length($DNA) - 3 ) {
        my $aa = substr($DNA, $i, 3);

        if ( exists $genetic_code{$aa} ) {
            $protein .= $genetic_code{$aa};
        } else {
            $protein .= 'X';
        }
        $i += 3;
    }
    $protein = $1 if $protein =~ /(.+?)\*/; # remove AAs behind 1st stop codon
    return $protein;
}


sub run_UBLAST_MCL {
    STDERR->say ("Running UBLAST ...");
    
    my $db_list = IO::File->new ("$temp_dir/pr_db/db_list.txt", 'w') or die $!;

    #make ublast db (reciprocal ublast method)
    my @pr_dbs = glob "$temp_dir/pr_db/*.pr.fa";
    for my $db1 (@pr_dbs) {
        (my $db1_name = basename $db1) =~ s/\.pr\.fa//;
        system "$usearch_version -makeudb_ublast $db1 -output $temp_dir/pr_db/$db1_name.udb >>$temp_dir/msg.txt 2>&1";
        
        #query each db
        for my $db2 (@pr_dbs) {
            next if $db1 eq $db2;
            (my $db2_name = basename $db2) =~ s/\.pr\.fa//;
            $db_list->print ("$db2\t$temp_dir/pr_db/$db1_name.udb\t$temp_dir/pr_db/$db2_name\_$db1_name\_result\n");
        }
    }
    $db_list->close;
    
    system "parallel --no-notice --jobs $jobs --colsep '\t' -a $temp_dir/pr_db/db_list.txt $usearch_version -ublast {1} -db {2} -threads $threads -evalue 1e-5 -top_hit_only -id 0.5 -blast6out {3} >>$temp_dir/msg.txt 2>&1";

    # reciprocal best hits (RBH) 
    my (%passed, %reciprocal_pairs, %gene_group);
    my @res_files = glob "$temp_dir/pr_db/*result";
    for my $file1 (@res_files) {
        if ($file1 =~ /pr\_db\/(.+?)\_(.+?)\_result/) {
            my ($spe1, $spe2) = ($1, $2);
            next if exists $passed{"$spe2 $spe1"};
            $passed{"$spe1 $spe2"} = 1;
            (my $file2) = grep { /pr_db\/$spe2\_$spe1\_result/ } @res_files or next;
            
            my $pair1 = protein_pairs($file1);
            my $pair2 = protein_pairs($file2);
            
            for (keys %{$pair1}) {
                next unless /(.+)\s(.+)/;
                if (exists $pair2->{"$2 $1"}) {
                    $reciprocal_pairs{"$_"} = 
                        $pair1->{$_} . " " . $pair2->{"$2 $1"}; # get reciprocal hits
                }
            }
        }
    }
    
    my (%bits, %best_bits);
    for (keys %reciprocal_pairs) {
        my ($gene1, $gene2) = split;
        my ($eval1, $bits1, $eval2, $bits2) = split /\s/, $reciprocal_pairs{$_};
        push @{$bits{$gene1}}, $bits1;
        push @{$bits{$gene2}}, $bits2;
    }
    for (keys %bits) { $best_bits{$_} = max @{$bits{$_}}; } # get best bitscore
    
    for (keys %reciprocal_pairs) {
        my ($gene1, $gene2) = split;
        my ($eval1, $bits1, $eval2, $bits2) = split /\s/, $reciprocal_pairs{$_};
        if ($bits1 / $best_bits{$gene1} < $bits_cutoff or
            $bits2 / $best_bits{$gene2} < $bits_cutoff) { # bitscore cutoff
            delete $reciprocal_pairs{$_}; 
        }
    }
    
    # MCL orthologous gene clustering
    STDERR->say ("Running MCL ...");
    my $abc = IO::File->new ("$temp_dir/pr_db/ublast.abc", 'w') or die $!;
    for (keys %reciprocal_pairs) {
        my ($gene1, $gene2) = split;
        my ($eval1, $bits1, $eval2, $bits2) = split /\s/, $reciprocal_pairs{$_};
        print $abc "$gene1\t$gene2\t$bits1\n$gene2\t$gene1\t$bits2\n";
    }
    $abc->close;
    
    system "mcxload -abc $temp_dir/pr_db/ublast.abc --stream-mirror -o $temp_dir/pr_db/ublast.mci -write-tab $temp_dir/pr_db/ublast.tab 2>>$temp_dir/msg.txt";
    system "mcl $temp_dir/pr_db/ublast.mci -I 1.4 -use-tab $temp_dir/pr_db/ublast.tab -o $temp_dir/pr_db/ublast.out 2>>$temp_dir/msg.txt";

    my $group_num;
    my $mcl_result = IO::File->new ("$temp_dir/pr_db/ublast.out", 'r') or die $!;
    while (<$mcl_result>) {
        chomp;
        next if /^\s*$/;
        my $group = 'Group' . ++$group_num;
        my @gene_arr = split;
        $gene_group{$group}{$_} = 1 for (@gene_arr);
    }
    return \%gene_group;
}


sub protein_pairs {
    my $infile = shift;
    my %result;
    my $infile_fh = IO::File->new ($infile, 'r') or die $!; 
    while (<$infile_fh>) {
        my @arr = split;
        
        if ($result{"$arr[0] $arr[1]"}) { # query, subj
            my ($new_evalue, $new_bitscore) = split /\s/, $result{"$arr[0] $arr[1]"};
            next if ($arr[-2] > $new_evalue or $arr[-1] < $new_bitscore);
        }
        $result{"$arr[0] $arr[1]"} = "$arr[-2] $arr[-1]";
    }
    return \%result;
}


sub ortho_group {
    my $file = shift;
    my %ortho;
    my $group;

    my $infile = IO::File->new( $file, 'r' ) or die $!;
    while(<$infile>) {
        next if /^\s+$/;
        chomp;
        s/\r//;
        if (/^#(\w+)/) {
            $group = $1;
            next;
        }
        $ortho{$group}{$_} ++;
    }
    return \%ortho;
}


sub run_MUSCLE {
    my $gene_group_href = shift;
    
    #generate individual orthologous groups;
    STDERR->say ("Generating othologous protein groups ...");

    for my $groupID (keys %{$gene_group_href}) {
        for my $member ( keys %{$gene_group_href->{$groupID}} ) {
            next unless exists $all_proteomes_href->{$member};
            if ($all_proteomes_href->{$member} =~ /\*/) { #omit prs with stop-codons
                STDERR->say ("$member translated pr (+1 frame) contains stop codon, omitted.");
                next;
            }
            
            open my $group_file, '>>', "$temp_dir/ortho/$groupID.fa";
            print $group_file ">$groupID\|$member\n", $all_proteomes_href->{$member}, "\n";
            close $group_file;
        }
    }
    
    undef $all_proteomes_href;
    
    #running MUSCLE
    STDERR->say ("Running MUSCLE ...");
    
    system "find $temp_dir/ortho/ -name '*.fa' | parallel --no-notice --jobs $jobs $muscle_version -quiet -in {} -out {.}.afa";
    
    # store MUSCLE results into hash and output to ortho_aln file
    my $AFA_file = IO::File->new("$output_folder/ortho_aln_MUSCLE.afa", 'w') or die $!;
    my $MUSCLE_file = IO::File->new("find $temp_dir/ortho/ -name '*.afa' | xargs cat |") or die $!;
    my $id  = '';
    my $seq = '';
    while (<$MUSCLE_file>) {
        chomp;
        if (/^>(\S+)/) {
            if ($id ne '') {
                $MUSCLE_aln_href->{$id} = $seq;
                $AFA_file->say (">$id\n$seq");
            }
            $id  = $1;
            $seq = '';
        } else {
            $seq .= $_;
        }
    }
    $MUSCLE_aln_href->{$id} = $seq;
    $AFA_file->say (">$id\n$seq");
    
    STDERR->say ("MUSCLE completed.");
}


sub cds_new { #cds are split into 2kb segments, as RNAhyb corrupts with long seqs
    my $cds_file_href = shift;
    my %cdsnew;

    while ( my ($id, $seq) = each %{$cds_file_href} ) {

        my $line_num = 2 * int(length($seq)/2000);

        my $new_line_num;
        if (length($seq) <= 2000) {
            $new_line_num = 1;
        } elsif (length($seq)%2000 < 15) {
            $new_line_num = $line_num;
        } else {
            $new_line_num = $line_num + 1;
        }

        for my $i (1..$new_line_num) {
            if ($i%2) {
                my $start  = 1000 * ($i - $i%2) + 1;
                my $newseq = substr($seq, $start-1, 2000);
                $cdsnew{"$id\_$start"} = $newseq;

            } else {
                my $start  = 1000 * ($i - $i%2) - 14;
                my $newseq = substr($seq, $start-1, 30);
                $cdsnew{"$id\_$start"} = $newseq;
            }
        }
    }
    return \%cdsnew;
}


sub run_RNAhybrid {
    my ($mirID, $mirfile, $cdsnewfile) = @_;
    
    STDERR->print ("Running RNAhybrid for $mirID ...  ");

    open my $run_rnahyb, "RNAhybrid -d theta -q $mirfile -t $cdsnewfile -u $mispair_cutoff -v $mispair_cutoff -c |" or die $!;
    while (<$run_rnahyb>) {
        $rnahy_output_ID ++;
        chomp;
        RNAhybrid_parser($_);
    }
    STDERR->say ("done.");
}


sub RNAhybrid_parser {
    my $line = shift;
    my @line_arr = split/:/,$line or return -1;
    my ($field7, $field8, $field9, $field10) = @line_arr[7, 8, 9, 10];

    return -1 unless ($field7 and $field8 and $field9 and $field10);
    my $field7_copy  = $field7  =~ s/\s+//gr;
    my $field10_copy = $field10 =~ s/\s+//gr;
    if (length($field7_copy) - 2 > $mispair_cutoff or length($field10_copy) > $mispair_cutoff) {
        return -1;
    }

    my @f7  = split//, reverse $field7;
    my @f8  = split//, reverse $field8;
    my @f9  = split//, reverse $field9;
    my @f10 = split//, reverse $field10;

    #find mir 5' end
    my ($mir_beg, $mir_end);
    for my $i (0..$#f7) {
        next if ($f9[$i]=~/\s/ and $f10[$i]=~/\s/);
        $mir_beg = $i;
        last;
    }
    
    #find mir 3' end
    for my $i (-$#f7..0) {
        next if ($f9[-$i]=~/\s/ and $f10[-$i]=~/\s/);
        $mir_end = -$i;
        last;
    }

    my $cleavage = 'Yes';
    my ($tar, $mir, $aln_symb);
    my ($pos_count, $score, $g3_g5, $mispair, $seed_mispair, $p8_mispair,
        $middle_mispair, $end_mispair, $MFE_ratio) = (0,0,0,0,0,0,0,0,0,0);

    for my $i ($mir_beg..$mir_end) {
        my $tar_loop_nt = $f7[$i];
        my $tar_pair_nt = $f8[$i];
        my $mir_pair_nt = $f9[$i];
        my $mir_loop_nt = $f10[$i];

        if ($tar_loop_nt eq ' ' and $tar_pair_nt eq ' ' and 
            $mir_pair_nt eq ' ' and $mir_loop_nt ne ' ') {
            $mispair ++;
            return -1 if $mispair > $mispair_cutoff;
            
            # output alignment symbol
            $tar .= '-';
            $aln_symb .= ' ';
            $mir .= $mir_loop_nt;
            $pos_count ++;
            
            #score
            if ($pos_count >= 2 and $pos_count <= 12) {
                $score += 2;
            } else {
                $score += 1;
            }
            return -1 if $score > $score_cutoff;
            $g3_g5 = 1 if $pos_count >= 3 and $pos_count <= 5;
            
            #seed mispairs, cleavage
            if ($pos_count >= 2 and $pos_count <= 7) {
                $seed_mispair ++;
                return -1 if $seed_mispair > $seed_mispair_cutoff;
            } elsif ($pos_count == 8 and defined $mimics) {
                $p8_mispair ++;
            } elsif ($pos_count >= 9 and $pos_count <= 11) {
                $cleavage = 'No';
                $middle_mispair ++ if defined $mimics;
            } elsif ($pos_count >= 12 and $pos_count <= 19 and defined $mimics) {
                $end_mispair ++;
                
                #mimics arm (positions 2-8, 12-19) misp <= $mimics_arm_misp_cutoff
                return -1 if $seed_mispair + $p8_mispair + $end_mispair > $mimics_arm_misp_cutoff;
            }

        } elsif ($tar_loop_nt ne ' ' and $tar_pair_nt eq ' ' and 
                $mir_pair_nt eq ' ' and $mir_loop_nt eq ' ') {
            $mispair ++;
            return -1 if $mispair > $mispair_cutoff;
            $tar .= $tar_loop_nt;
            $aln_symb .= ' ';
            $mir .= '-';
            
            if ($pos_count >= 2 and $pos_count <= 11) { # pos 2-11
                $score += 2;
            } else {
                $score += 1;
            }
            return -1 if $score > $score_cutoff;
            $g3_g5 = 1 if $pos_count >= 3 and $pos_count <= 4; # pos 2-4
            
            if ($pos_count >= 2 and $pos_count <= 7) {
                $seed_mispair ++;
                return -1 if $seed_mispair > $seed_mispair_cutoff;
            } elsif ($pos_count == 8 and defined $mimics) {
                $p8_mispair ++;
            } elsif ($pos_count >= 9 and $pos_count <= 10) { # pos9-10
                $cleavage = 'No';
                $middle_mispair ++ if defined $mimics;
            } elsif ($pos_count >= 12 and $pos_count <= 19 and defined $mimics) {
                $end_mispair ++;
                return -1 if $seed_mispair + $p8_mispair + $end_mispair > $mimics_arm_misp_cutoff;
            }

        } elsif ($tar_loop_nt ne ' ' and $tar_pair_nt eq ' ' and 
                $mir_pair_nt eq ' ' and $mir_loop_nt ne ' ') {
            $mispair += 1;
            return -1 if $mispair > $mispair_cutoff;
            $tar .= $tar_loop_nt;
            $aln_symb .= ' ';
            $mir .= $mir_loop_nt;
            $pos_count ++;
            
            if ($pos_count >= 2 and $pos_count <= 12) {
                $score += 2;
            } else {
                $score += 1;
            }
            return -1 if $score > $score_cutoff;
            $g3_g5 = 1 if $pos_count >= 3 and $pos_count <= 5;
            
            if ($pos_count >= 2 and $pos_count <= 7) {
                $seed_mispair ++;
                return -1 if $seed_mispair > $seed_mispair_cutoff;
            } elsif ($pos_count == 8 and defined $mimics) {
                $p8_mispair ++;
            } elsif ($pos_count >= 9 and $pos_count <= 11) {
                $cleavage = 'No';
                $middle_mispair ++ if defined $mimics;
            } elsif ($pos_count >= 12 and $pos_count <= 19 and defined $mimics) {
                $end_mispair ++;
                return -1 if $seed_mispair + $p8_mispair + $end_mispair > $mimics_arm_misp_cutoff;
            }

        } elsif ( ($tar_pair_nt eq 'U' and $mir_pair_nt eq 'G') or
                ($tar_pair_nt eq 'G' and $mir_pair_nt eq 'U') ) {
            $mispair += 1;
            return -1 if $mispair > $mispair_cutoff;
            $tar .= $tar_pair_nt;
            $aln_symb .= 'o';
            $mir .= $mir_pair_nt;
            $pos_count ++;
            
            if ($pos_count >= 2 and $pos_count <= 12) {
                $score += 1;
            } else {
                $score += 0.5;
            }
            return -1 if $score > $score_cutoff;
            $g3_g5 = 1 if $pos_count >= 3 and $pos_count <= 5;
            
            if ($pos_count >= 2 and $pos_count <= 7) {
                $seed_mispair ++;
                return -1 if $seed_mispair > $seed_mispair_cutoff;
            } elsif ($pos_count == 8 and defined $mimics) {
                $p8_mispair ++;
            } elsif ($pos_count >= 9 and $pos_count <= 11) {
                $cleavage = 'No';
                $middle_mispair ++ if defined $mimics;
            } elsif ($pos_count >= 12 and $pos_count <= 19 and defined $mimics) {
                $end_mispair ++;
                return -1 if $seed_mispair + $p8_mispair + $end_mispair > $mimics_arm_misp_cutoff;
            }

        } else {
            $tar .= $tar_pair_nt;
            $aln_symb .= '|';
            $mir .= $mir_pair_nt;
            $pos_count ++;
        }
    }
    
    #mimics positions 9-11 misp <= 5
    if ( defined $mimics and 
        ($cleavage eq 'Yes' or $middle_mispair > 5 or 
        $seed_mispair + $p8_mispair + $end_mispair > $mimics_arm_misp_cutoff) ) {#reset if needed
        return -1;
    }
        
    my $tarID = $line_arr[0] =~ s/(.+)\_(\d+)$/$1/r;
    my $split_pos = $2;
    my $mirID = $line_arr[2];
    my $rnahyb_pos = $line_arr[6];
    my $rna_start_pos = $split_pos + $rnahyb_pos;

    $tar = reverse $tar;      #change to 5' 3'
    my $mir_r = reverse $mir; #change to 3' 5'
    $aln_symb = reverse $aln_symb;
    
    $score ++ if $g3_g5;
    return -1 if $score > $score_cutoff;
    
    #output detailed info
    open my $rnahyb_details, '>>', "$output_folder/rnahy_out_details" or die $!;
    if ($tab_format) {
        print $rnahyb_details "\#R$rnahy_output_ID\t";
        print $rnahyb_details "$tarID\t$tar\t$mirID\t$mir\t$mispair\t$score\t";
        print $rnahyb_details "$seed_mispair\t$cleavage";
        print $rnahyb_details "\t$rna_start_pos\n";
        
    } else {
        print $rnahyb_details "\#R$rnahy_output_ID", "="x60, "\n";
        printf $rnahyb_details "%-35s\t%s\n", $tarID, "5' $tar 3'";
        printf $rnahyb_details "%-35s\t%s\n", "", "   $aln_symb";
        printf $rnahyb_details "%-35s\t%s\n\n", $mirID, "3' $mir_r 5'";
        printf $rnahyb_details "%-35s\t%s\n", "Total mispairs:", $mispair;
        printf $rnahyb_details "%-35s\t%s\n", "Score:", $score;
        printf $rnahyb_details "%-35s\t%s\n", "Seed mismpairs:",  $seed_mispair;
        printf $rnahyb_details "%-35s\t%s\n", "Cleavage:", $cleavage;
        printf $rnahyb_details "%-35s\t%s\n\n", "Position:", $rna_start_pos;
    }
    close $rnahyb_details;

    return -1 if defined $mimics;

    #pr start-end positions
    my $tar_copy = $tar =~ s/[^\w]//gr;
    my $rna_end_pos = $rna_start_pos + length($tar_copy) - 1;
    my $pr_start_pos = ($rna_start_pos - $rna_start_pos%3) / 3 + 1;
    my $pr_end_pos   = ($rna_end_pos - $rna_end_pos%3) / 3;

    #find algnment positions of pr start-end 
    my ($aln_pr_start, $aln_pr_end, $group);
    for my $k (keys %{$MUSCLE_aln_href}) {
        if ($k =~ /(\S+?)\|$tarID/) {
            ($aln_pr_start, $aln_pr_end) = 
            aln_find_pos($MUSCLE_aln_href->{$k}, $pr_start_pos, $pr_end_pos);
            $group = $1;

            #output rnahy_out file for conservation analysis
            open  my $rnahyb_output, '>>', "$temp_dir/rnahy_out" or die $!;
            print $rnahyb_output "$mirID\t$tarID\t$aln_pr_start\t",
                            "$aln_pr_end\t$group\tR$rnahy_output_ID\n";
            close $rnahyb_output;
        }
    }
}


sub aln_find_pos {
    my ($gene_seq, $start, $end) = @_;

    my $segment1 = substr($gene_seq, 0, $start) =~ s/-//gr;
    my $offset  = $start - length($segment1);
    my $segment2  = substr($gene_seq, $start);
    
    my ($i, $newpos) = (0,0);
    if ($offset > 0) {
        while($segment2 =~ /\w/g) {
            $i++;
            $newpos = pos($segment2);
            last if $i == $offset;
        }
    }
    my $newstart = $start + $newpos;
    
    $offset = $end - $start;
    my $segment3 = substr($gene_seq, $newstart);
    my $j;
    while($segment3 =~ /\w/g) {
        $j++;
        $newpos = pos($segment3);
        last if $j == $offset;
    }
    my $newend = $newstart + $newpos;
    
    return ($newstart, $newend);
}


sub run_FASTA {
    my ($mirName, $mirfile, $cdsfile) = @_;
    
    my $mirAbbr = substr($mirName, 0, 5);
    my %fasta_hash;
    my $find_first = 0;
    my ($geneName, $rna_pair_beg_pos, $rna_pair_end_pos, $rna_start_pos, $rna_end_pos, 
        $mir_end_pos, $mir_rc, $aln, $aln_copy, $tar, $aln_line, $tar_line, $left_pos);
    my (@mir, @tar, @aln);

    STDERR->print ("Running FASTA for $mirName ...  ");
    
    open my $run_fasta, "$fasta_version -A -n -Q -i -U -T $threads -E 100000 $mirfile $cdsfile 1 2>/dev/null |" or die $!;
    while (<$run_fasta>) {
        $fasta_output_ID ++;
        chomp;
        unless ($find_first) {
            next unless /^>>(\S+)/;
            $geneName = $1;
            $find_first = 1;
            next;
            
        } else {
            if (/^>>(\S+)/) {
                $geneName = $1;

            } elsif (/Smith-Waterman.+(\d+)-\d+:(\d+)-(\d+)/) {
                $mir_end_pos  = $1;
                $rna_pair_beg_pos = $2; 
                $rna_pair_end_pos = $3;

            } elsif (/^($mirAbbr\-\s+)(\S+)/) {
                $mir_rc = $2;
                
                $left_pos = length($1);
                $aln_line = <$run_fasta>;  #aln symbol
                if ($aln_line =~ /^(\s+)(\S.*\S)(\s*)$/) {
                    $aln = $2;
                    $aln_copy = $aln =~ s/://gr;
                }
                
                $tar_line = <$run_fasta>;
                $tar = substr($tar_line, $left_pos, length($mir_rc));
                $tar =~ s/\s/-/g;    #some targs have unpairings at 5' and 3' end
                
                #primary filtering
                next if (defined $aln_copy and length($aln_copy) > $mispair_cutoff);
                
                FASTA_parser($mirName, $geneName, $mir_rc, $tar, 
                            $rna_pair_beg_pos, $rna_pair_end_pos);
            }
        }
    }
    close $run_fasta;
    STDERR->say ("done.");
}


sub FASTA_parser {
    my ($mirID, $geneID, $mir_rc, $tar, $rna_start_pos, $rna_end_pos) = @_;
    my $mir_rc_copy = $mir_rc =~ s/-//gr;
    return -1 if length($mir_rc_copy) < 10; #remove some low-quality results
    
    (my $mir = reverse uc $mir_rc) =~ tr/AUGC/UACG/;
    $tar = reverse uc $tar;
    my @mir = split //, $mir; #5'-3'
    my @tar = split //, $tar; #3'-5'
    my $aln_symb;
    my $cleavage = 'Yes';

    my ($count, $score, $g3g5, $total_misp, $seed_misp, $p8_misp, $middle_misp, $end_misp)
        = (0,0,0,0,0,0,0,0);

    my %pairing = (
    #class1:pairing, 2:G:U, 3:mismatch, 4:bulge miR strand, 5:bulge tar strand
    'AU' => 'class1',    'UA' => 'class1',    'GC' => 'class1',    'CG' => 'class1',
    'UG' => 'class2',    'GU' => 'class2',    'AG' => 'class3',    'AC' => 'class3',
    'UC' => 'class3',    'GA' => 'class3',    'CA' => 'class3',    'CU' => 'class3',
    'AA' => 'class3',    'UU' => 'class3',    'GG' => 'class3',    'CC' => 'class3', 
    'A-' => 'class4',    'U-' => 'class4',    'G-' => 'class4',    'C-' => 'class4', 
    '-A' => 'class5',    '-U' => 'class5',    '-G' => 'class5',    '-C' => 'class5',
    );
    
    for my $i (0..$#mir) {
        my $mir_nt = $mir[$i];
        my $tar_nt = $tar[$i];
        return -1 unless defined $pairing{"$mir_nt$tar_nt"};
        if ($pairing{"$mir_nt$tar_nt"} eq 'class1') {
            $count ++;
            $aln_symb .= '|';
            
        } elsif ($pairing{"$mir_nt$tar_nt"} eq 'class2') {
            $count ++;
            $total_misp ++;
            return -1 if $total_misp > $mispair_cutoff;
            $aln_symb .= 'o';
            
            #Score
            if ($count >= 2 and $count <= 12) {
                $score += 1;
             } else {
                $score += 0.5;
            }
            return -1 if $score > $score_cutoff;
            $g3g5 = 1 if $count >= 3 and $count <= 5;
            
            #seed mispairs, cleavage
            if ($count >= 2 and $count <= 7) {
                $seed_misp ++;
                return -1 if $seed_misp > $seed_mispair_cutoff;
            } elsif ($count == 8 and defined $mimics) {
                $p8_misp ++;
            } elsif ($count >= 9 and $count <= 11) {
                $cleavage = 'No';
                $middle_misp ++ if defined $mimics;
            } elsif ($count >= 12 and $count <= 19 and defined $mimics) {
                $end_misp ++;
                
                #mimics arm (positions 2-8, 12-19) misp <= $mimics_arm_misp_cutoff
                return -1 if $seed_misp + $p8_misp + $end_misp > $mimics_arm_misp_cutoff; 
            }
            
        } elsif ($pairing{"$mir_nt$tar_nt"} eq 'class3') {
            $count ++;
            $total_misp ++;
            return -1 if $total_misp > $mispair_cutoff;
            $aln_symb .= ' ';
            
            if ($count >= 2 and $count <= 12) {
                $score += 2;
            } else {
                $score += 1;
            }
            return -1 if $score > $score_cutoff;
            $g3g5 = 1 if $count >= 3 and $count <= 5;
            
            if ($count >= 2 and $count <= 7) {
                $seed_misp ++;
                return -1 if $seed_misp > $seed_mispair_cutoff;
            } elsif ($count == 8 and defined $mimics) {
                $p8_misp ++;
            } elsif ($count >= 9 and $count <= 11) {
                $cleavage = 'No';
                $middle_misp ++ if defined $mimics;
            } elsif ($count >= 12 and $count <= 19 and defined $mimics) {
                $end_misp ++;
                return -1 if $seed_misp + $p8_misp + $end_misp > $mimics_arm_misp_cutoff;
            }
            
        } elsif ($pairing{"$mir_nt$tar_nt"} eq 'class4') {
            $count ++;
            $total_misp ++;
            return -1 if $total_misp > $mispair_cutoff;
            $aln_symb .= ' ';
            
            if ($count >= 2 and $count <= 12) {
                $score += 2;
            } else {
                $score += 1;
            }
            return -1 if $score > $score_cutoff;
            $g3g5 = 1 if $count >= 3 and $count <= 5;
            
            if ($count >= 2 and $count <= 7) {
                $seed_misp ++;
                return -1 if $seed_misp > $seed_mispair_cutoff;
            } elsif ($count == 8 and defined $mimics) {
                $p8_misp ++;
            } elsif ($count >= 9 and $count <= 11) {
                $cleavage = 'No';
                $middle_misp ++ if defined $mimics;
            } elsif ($count >= 12 and $count <= 19 and defined $mimics) {
                $end_misp ++;
                return -1 if $seed_misp + $p8_misp + $end_misp > $mimics_arm_misp_cutoff;
            }

        } elsif ($pairing{"$mir_nt$tar_nt"} eq 'class5') {
            $total_misp ++;
            return -1 if $total_misp > $mispair_cutoff;
            $aln_symb .= ' ';
            
            if ($count >= 2 and $count <= 11) {
                $score += 2;
            } else {
                $score += 1;
            }
            return -1 if $score > $score_cutoff;
            $g3g5 = 1 if $count >= 3 and $count <= 4;
            
            if ($count >= 2 and $count <= 7) {
                $seed_misp ++;
                return -1 if $seed_misp > $seed_mispair_cutoff;
            } elsif ($count == 8 and defined $mimics) {
                $p8_misp ++;
            } elsif ($count >= 9 and $count <= 10) {
                $cleavage = 'No';
                $middle_misp ++ if defined $mimics;
            } elsif ($count >= 11 and $count <= 19 and defined $mimics) {# pos 12-19
                $end_misp ++;
                return -1 if $seed_misp + $p8_misp + $end_misp > $mimics_arm_misp_cutoff;
            }
        }
    }

    $score ++ if $g3g5;
    return -1 if $score > $score_cutoff;

    #mimics positions 9-11 misp <= 5
    if ( defined $mimics and 
        ($cleavage eq 'Yes' or $middle_misp > 5 or 
        $seed_misp + $p8_misp + $end_misp > $mimics_arm_misp_cutoff) ) {
        return -1;
    }

    $tar      = reverse $tar; #change to 5'-3'
    my $mir_r = reverse $mir; #change to 3'-5'
    $aln_symb = reverse $aln_symb;

    #output detailed info
    open my $fasta_details, '>>', "$output_folder/fasta_out_details" or die $!;
    if ($tab_format) {
        print $fasta_details "\#F$fasta_output_ID\t";
        print $fasta_details "$geneID\t$tar\t$mirID\t$mir\t$total_misp\t",
                            "$score\t$seed_misp\t$cleavage\t$rna_start_pos\n";
    } else {
        print $fasta_details "\#F$fasta_output_ID", "=="x40, "\n";
        printf $fasta_details "%-35s\t%s\n", $geneID, "5' $tar 3'";
        printf $fasta_details "%-35s\t%s\n", "", "   $aln_symb";
        printf $fasta_details "%-35s\t%s\n\n", $mirID, "3' $mir_r 5'";
        printf $fasta_details "%-35s\t%s\n", "Total mispairs:", $total_misp;
        printf $fasta_details "%-35s\t%s\n", "Score:", $score;
        printf $fasta_details "%-35s\t%s\n", "Seed mispairs:", $seed_misp;
        printf $fasta_details "%-35s\t%s\n", "Cleavage:", $cleavage;
        printf $fasta_details "%-35s\t%s\n\n", "Position:", $rna_start_pos;
    }
    close $fasta_details;
    
    return -1 if defined $mimics;
    
    my $pr_start_pos = ($rna_start_pos - $rna_start_pos%3) / 3 + 1;
    my $pr_end_pos   = ($rna_end_pos - $rna_end_pos%3) / 3;
    
    #output fasta_output file for conservation analysis
    my ($pr_aln_start, $pr_aln_end, $group);
    for my $k (keys %{$MUSCLE_aln_href}) {
        if ($k =~ /(\S+?)\|$geneID$/) { 
            ($pr_aln_start, $pr_aln_end) = 
                aln_find_pos($MUSCLE_aln_href->{$k}, $pr_start_pos, $pr_end_pos);
            $group = $1;
            open  my $fasta_output, '>>', "$temp_dir/fasta_out" or die $!;
            print $fasta_output "$mirID\t$geneID\t$pr_aln_start\t",
                            "$pr_aln_end\t$group\tF$fasta_output_ID\n";
            close $fasta_output;
        }
    }
}


sub conservation {
    my ($infile, $outfile) = @_;
    unless (-e $infile) {
        STDERR->say ("\nNo hits found.\n", "-"x6, " TarHunter analysis completed.");
        exit;
    }
    
    my $infile2 = basename $infile =~ s/(.+)/$1\_details/r;
    system " sort -k 1,1 $infile > $infile\.sorted.tmp ";
    system " join -1 1 -2 2 $infile\.sorted.tmp  $temp_dir/mircluster.sorted.tmp > $infile\.merged.tmp ";
    #merged file format: mirID-geneID-start-end-geneGroup-siteNum-mirCluster 

    my $result_href = parse_merged_file("$infile\.merged.tmp");
    
    #store detailed info into hash
    my %hash_details;
    open my $rnahy_fasta_details, "$output_folder/$infile2" or die $!;
    local $/ = '#';
    my $discarded = <$rnahy_fasta_details>;
    while (<$rnahy_fasta_details>) {
        chomp;
        my ($id, $details);
        if ($tab_format) {
            ($id, $details) = /(.+?)\t(.+)/s or next;
        } else {
            ($id, $details) = /(.+?)\n(.+)/s or next;
        }
        $id =~ s/=//g;
        $hash_details{$id} = $details;
    }
    close $rnahy_fasta_details;
    local $/ = "\n";
    
    #sequenceID as key, speciesID as value
    my %hash_species;
    for my $species (keys %{$species_list_href}) {
        for my $file(glob "$dbs_folder/*.fa") {
            next unless $file =~ /$species\.fa/;
            my $spe_hash = IO::File->new ($file, 'r') or die $!;
            while (<$spe_hash>) {
                chomp;
                $hash_species{$1} = $species if />(\S+)(.*)/;
            }
        }
    }
    
    if ($infile eq "$temp_dir/rnahy_out") {
        STDERR->say ("Outputing RNAhybrid results ...");
    } elsif ($infile eq "$temp_dir/fasta_out") {
        STDERR->say ("Outputing FASTA results ...");
    }
    
    my ($identity_cutoff_pr, $identity_cutoff_nt) = (0.55, 0.65);
    my $final_output = IO::File->new($outfile, 'w') or die $!;
    if ($tab_format) {  #tab format header
        print $final_output "miRID\tmiR sequence\ttargetID\ttarget sequence\tTotal mispair\t",
            "Score\tSeed mispair\tCleavage\tPosition\tSpecies\tIdentity(nt)\%\tIdentity(aa)\%\n";
    }
    
    for my $gene_group (keys %{$result_href}) {
        for my $mir_cluster (keys %{$result_href->{$gene_group}}) { 
            for my $site_num (keys %{$result_href->{$gene_group}{$mir_cluster}}) {
                my $inside_group = $result_href->{$gene_group}{$mir_cluster}{$site_num};
                
                #inside a unique group
                my (@start_arr, @end_arr, %group_species);
                for my $element (keys %{$inside_group}) {
                    my $group_members = $inside_group->{$element};
                    my $species = $hash_species{$group_members->[1]}; #$group_member->[1]:geneID
                    push @start_arr, $group_members->[2];
                    push @end_arr, $group_members->[3];
                    $group_species{$species} ++;
                }
                my $num_of_species = scalar (keys %group_species);
                next if $num_of_species < 2; # at least 2 species
                
                # target site aa boundary
                my $min = min @start_arr;
                my $max = max @end_arr;
                
                my $aa_tar_region;
                open my $site_pr_file, '>>', "$temp_dir/tar_site_pr.fa" or die $!;
                open my $site_nt_file, '>>', "$temp_dir/tar_site_nt.fa" or die $!;
                for my $element (keys %{$inside_group}) {
                    #output target site aa seqs
                    my $gene_id = $inside_group->{$element}->[1];
                    $aa_tar_region = extract_site($min, $max, $gene_id, $gene_group);
                    $aa_tar_region =~ s/-//g;
                    print $site_pr_file ">$gene_id\n$aa_tar_region\n";
                    
                    #output target site nt seqs
                    my (@details_arr, @tar_nt_arr, $tar_nt, $aln_symb);
                    if ($tab_format) {
                        @details_arr = split/\t/, $hash_details{$element};
                        $tar_nt = $details_arr[1]; #site nt seq 5'-3'
                    } else {
                        @details_arr = split/\n/, $hash_details{$element};
                        @tar_nt_arr = split/\s+/, $details_arr[0];
                        $tar_nt = $tar_nt_arr[-2]; #site nt seq 5'-3'
                    }
                    print $site_nt_file ">$gene_id\n$tar_nt\n";
                }
                close $site_pr_file;
                close $site_nt_file;
                
                my ($aln_pr, $site_identity_pr) = aln_identity("$temp_dir/tar_site_pr.fa");
                my ($aln_nt, $site_identity_nt) = aln_identity("$temp_dir/tar_site_nt.fa");
                system "rm $temp_dir/tar_site_nt.fa $temp_dir/tar_site_pr.fa";
                
                next if $site_identity_pr <= $identity_cutoff_pr;
                next if $site_identity_nt <= $identity_cutoff_nt;
                
                my $rounded_identity_nt = sprintf("%.1f", $site_identity_nt * 100);
                my $rounded_identity_pr = sprintf("%.1f", $site_identity_pr * 100);
                
                print $final_output "=="x40, "\n" unless $tab_format;
                
                if ($tab_format) {
                    for my $element (sort {$inside_group->{$a}->[0] cmp $inside_group->{$b}->[0]} 
                                keys %{$inside_group}) {
                        chop $hash_details{$element};
                        print $final_output "$hash_details{$element}\t";
                        print $final_output "$num_of_species\t";
                        print $final_output "$rounded_identity_nt\t$rounded_identity_pr\n";
                    }
                } else {
                    #output details
                    for my $element (sort {$inside_group->{$a}->[0] cmp $inside_group->{$b}->[0]}
                                 keys %{$inside_group}) {
                        print $final_output "$hash_details{$element}\n";
                    }
                    
                    print $final_output "Conserved sites found in $num_of_species species.\n\n";
                    
                    #output target site nt alignment
                    print $final_output "Target site nucleotide acid identity: ", 
                                        $rounded_identity_nt, "\%\n";
                    print $final_output "$aln_nt\n";
                    
                    #output target site aa alignment
                    print $final_output "Target site aa motif identity: ",
                                        $rounded_identity_pr, "\%\n";
                    print $final_output "$aln_pr\n";
                }
            }
        }
    }
}


sub aln_identity {
    my $file = shift;
    my ($identity, $first) = (0,0);
    my ($aln_detail, $len, $symb, %duplicate);
    open my $site_aln_identity, "$muscle_version -in $file -clwstrict -quiet |" or die $!;

    while (<$site_aln_identity>) {
        chomp;
        next if /^CLUSTAL/;
        next if /^\s*$/;
        next if exists $duplicate{$_};
        $duplicate{$_} = 1;
        $aln_detail .= "$_\n"; #target site alignment
        unless ($first) {
            if (/^((\S+?)\s+)\S+/) {
                $len = length $1;
                $first = 1;
            }
        } else {
            next unless /\.|\:|\*/;
            $symb = substr($_, $len);
            my $symb_copy = $symb =~ s/\*//gr;
            $identity = 1-length($symb_copy)/length($symb);
        }
    }
    return ($aln_detail, $identity);
}


sub parse_merged_file {
    my $file = shift;
    my %hash;
    my ($first_in_all, $first_in_group, $siteNo, $aa_shift) = (1, 1, 0, 2);
    my ($curr, $prev);

    open my $merged_file, qq(sort -k 5,5  -k 7,7  -k 3n,3n  -k 4n,4n  $file |); 
    while ($curr = <$merged_file>) {
        $curr =~ s/\r//g;
        chomp $curr;
        next if $curr =~ /^\s+$/;
        if ($first_in_all) {  #first in all
            $prev = $curr;
            $first_in_all = 0;
            next;
        }

        my @a = split/\s+/, $prev;
        my @b = split/\s+/, $curr;
        my ($a2, $a3, $a4, $a5, $a6) = @a[2..6];
        my ($b2, $b3, $b4, $b5, $b6) = @b[2..6];

        if ($a4 eq $b4 and $a6 eq $b6 and abs($a2 - $b2) <= $aa_shift and
            abs($a3 - $b3) <= $aa_shift and $first_in_group ) { #first in group
            $siteNo ++;
            $hash{$a4}{$a6}{$siteNo}{$a5} = [@a[0..3]];  #mirID-geneID-start-end
            $hash{$a4}{$a6}{$siteNo}{$b5} = [@b[0..3]];  
            $first_in_group = 0;

        } elsif ($a4 eq $b4 and $a6 eq $b6 and abs($a2 - $b2) <= $aa_shift and
            abs($a3 - $b3) <= $aa_shift and $first_in_group == 0) {
            $hash{$a4}{$a6}{$siteNo}{$b5} = [@b[0..3]];

        } else {
            $first_in_group = 1;
        }
        $prev = $curr;
    }
    close $merged_file;
    return \%hash;
}


sub extract_site {
    my ($min, $max, $gene, $group) = @_;
    my ($tarsite_region, $region, $reg1, $reg2);
    my $seq = $MUSCLE_aln_href->{"$group\|$gene"};

    # target site aa motif
    $tarsite_region = substr($seq, $min - 1, $max - $min + 1);
    return $tarsite_region;
}


sub remove_duplicate {
    my ($infile, $outfile) = @_;
    my %result;
    my $delimiter = "=="x40;
    local $/ = "$delimiter";
    my $infile_fh = IO::File->new ($infile, 'r') or die $!; 
    while (<$infile_fh>) {
        chomp;
        next if /^\s*$/;
        $result{$_} = 1;
    }
    $infile_fh->close;
    
    my $outfile_fh = IO::File->new ($outfile, 'w') or die $!;
    for my $group (keys %result) {
        print $outfile_fh "$delimiter\n$group";
    }
}


sub mimics_nctarg_conservation {

    local $/ = '#';
    my %mir_group;
    my $mir_cluster_file;
    if ($nonmirbase_file) { $mir_cluster_file = IO::File->new ("$temp_dir/cdhit/mir_cluster.txt") }
    else                  { $mir_cluster_file = IO::File->new ("$dir/miRs/mir_cluster.txt", 'r') }
    my $omitted = <$mir_cluster_file>;
    while(<$mir_cluster_file>) {
        chomp;
        s/\r//g;
        my @arr = split/\n/, $_;
        for my $member (@arr[1..$#arr]) {
            next if $member =~ /^\s*$/;
            $member =~ s/^\s*|\s*$//;
            $mir_group{$member} = $arr[0];
        }
    }
    $mir_cluster_file->close;

    # store fasta/rnahy output details into hash
    my ($delimiter, $records, $fasta_rnahy_file, $final_out);
    my (%mimics_species, %info, %hash_details);
    
    if ($program_option == 1) {
        $fasta_rnahy_file = "$output_folder/fasta_out_details";
        $final_out = "$output_folder/fastaout_nc_conserv";
    } elsif ($program_option == 2) {
        $fasta_rnahy_file = "$output_folder/rnahy_out_details";
        $final_out = "$output_folder/rnahyout_nc_details";
    }
    
    unless (-e $fasta_rnahy_file) {
        STDERR->say ("\nNo hits found.\n", "-"x6, " TarHunter analysis completed.");
        exit;
    }
    
    $tab_format ? ($delimiter = "\t") : ($delimiter = "\n");
    my $details_outfile = IO::File->new($fasta_rnahy_file, 'r') or die $!;
    while (<$details_outfile>) {
        s/\r//g;
        chomp;
        next if /^\s*$/;
        my @arr = split/$delimiter/, $_;
        my $ID = $arr[0] =~ s/=//gr;
        my $geneID = $1 if $arr[1] =~ /^(\S+).*/;
        my $mirID = $1 if $arr[3] =~ /^(\S+).*/;
        my $start_pos = $1 if $arr[9] =~ /.*?(\d+)$/;
    
        $mimics_species{$1} = 1 if $mirID =~ /^(\w+)-miR/;
    
        $info{$ID} = [$mirID, $geneID, $start_pos];
    
        my @arr2 = (split/$delimiter/, $_, 2) or next;
        $hash_details{$ID} = $arr2[1];
    }
    $details_outfile->close;
    local $/ = "\n";

    # get all sequences from mimics result
    my %mimics_all_seq;
    for my $species (keys %mimics_species) {
        my $seq_href = read_seq_file("$dbs_folder/$species.fa");
        @mimics_all_seq{keys %{$seq_href}} = values %{$seq_href};
    }

    # extract upstream and downstream 50-bp (default) seqs
    for my $ID (sort keys %info) {
        my $mirID = $info{$ID}->[0];
        my $geneID = $info{$ID}->[1];
        my $pos = $info{$ID}->[2];
 
        my $seq = $mimics_all_seq{$geneID};
        my $upstream = ($pos > int($nc_len/2)) ? ($pos - int($nc_len/2)) : 1;
        my $newseq = substr($seq, $upstream - 1, $nc_len);

        my $species = $1 if $mirID =~ /^(\w+)-miR/;

        open my $nc_seq, '>>', "$temp_dir/mimics/$species.fa";
        print $nc_seq ">$ID\|$mirID\|$geneID\n$newseq\n";
        close $nc_seq;
    }

    # run usearch
    my %checked;
    my $db_list = IO::File->new ("$temp_dir/mimics/db_list.txt", 'w') or die $!;
    for my $species1 (keys %mimics_species) {
        my $db1 = "$temp_dir/mimics/$species1.fa";
        next unless (-e $db1);
        system "$usearch_version -makeudb_usearch $db1 -output $temp_dir/mimics/$species1.udb >>$temp_dir/msg.txt 2>&1";
            
        for my $species2 (keys %mimics_species) {
            next if $species1 eq $species2;
            next if exists $checked{"$species2$species1"};
            my $db2 = "$temp_dir/mimics/$species2.fa";
            next unless (-e $db2);
            $checked{"$species1$species2"} = 1;
            
            $db_list->print ("$db2\t$temp_dir/mimics/$species1.udb\t$temp_dir/mimics/$species2\_$species1\_result\n");
        }
    }
    $db_list->close;
    
    system "parallel --no-notice --jobs $jobs --colsep '\t' -a $temp_dir/mimics/db_list.txt $usearch_version -usearch_global {1} -db {2} -strand plus -threads $threads -id $iden_cutoff -blast6out {3} >>$temp_dir/msg.txt 2>&1";

    my @result_files = glob "$temp_dir/mimics/*result";
    STDERR->say ("\n", "-"x6, " TarHunter analysis completed.") and exit if @result_files < 1;
    system "cut -f1,2 $temp_dir/mimics/*result | sort -u > $temp_dir/mimics/ublast_pairs";


    # parsing usearch result
    my %pair;
    my $ublast_pair_file = IO::File->new("$temp_dir/mimics/ublast_pairs", 'r');
    while (<$ublast_pair_file>) {
        chomp;
        my @line = split;
        my @id1 = split/\|/, $line[0];
        my @id2 = split/\|/, $line[1];

        if ( $mir_group{$id1[1]} eq $mir_group{$id2[1]} ) {
            $pair{$id1[0]}{$id2[0]} = 1;
            $pair{$id2[0]}{$id1[0]} = 1;
        }
    }
    $ublast_pair_file->close;

    my ($elements_href, $group_num, $group_href);
    while ( my ($memb, $v) = each %pair ) {
        my @elems = keys %{ recursive_grouping(\%pair, $memb, $elements_href) };
        $group_num ++;

        delete @pair{@elems};
        for (@elems) {
            $group_href->{$group_num}{$_} = 1;
        }
    }

    my $output = IO::File->new ($final_out, 'w');
    for my $groupID (keys %{$group_href}) {
        print $output ("=="x40, "\n") unless $tab_format;
        my $seqs = IO::File->new ("$temp_dir/mimics/seqs.fa", 'w');
        
        for my $ID ( keys %{$group_href->{$groupID}} ) {
            print $output "$hash_details{$ID}\n";
            unless ($tab_format) {
                my $gene = $info{$ID}->[1];
                my $pos = $info{$ID}->[2];
                my $seq = $mimics_all_seq{$gene};
                my $upstream = ( $pos > int($nc_len/2) ) ? ($pos - int($nc_len/2)) : 1;
                my $seq_100bp = substr($seq, $upstream - 1, $nc_len);
                print $seqs ">$gene\n$seq_100bp\n"; 
            }
        }
        
        unless ($tab_format) {
            open my $seqs_aln, "$muscle_version -in $temp_dir/mimics/seqs.fa -clwstrict -quiet |" or die $!;
            print $output "Alignment of target sites:\n";
            while (<$seqs_aln>) {
                chomp;
                next if /^CLUSTAL/;
                print $output "$_\n";
            }
        }
    }
}


sub recursive_grouping {
    my ($gene_pair, $gene, $elements_href) = @_;
    
    if (exists $elements_href->{$gene}) {
        return;
    } else {
        $elements_href->{$gene} = 1;
    }
    
    for my $k (keys %{$gene_pair->{$gene}}) {
        recursive_grouping($gene_pair, $k, $elements_href);
    }
    return $elements_href;
}
__END__
Home | Browse | Search | Download | Guide | Contact