#!/usr/bin/perl
#########################################################################
#
# TarHunter
#
# Version 2.1
#
# This program is designed for predicting conserved miRNA target and
# target mimics in plants.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# Author: Xuan Ma (skyxma@tjnu.edu.cn)
#
#########################################################################
use strict;
use warnings;
use 5.010;
use Getopt::Long qw/:config no_ignore_case/;
use Cwd 'abs_path';
use File::Temp qw/tempfile tempdir/;
use File::Basename;
use IO::File;
use List::Util qw/max min/;
no warnings 'recursion';
my $TarHunter_version = "2.1";
my $fasta_version = "fasta36";
my $query_mir_list_file = '';
my $species_list_file = '';
my $dbs_folder = '';
my $output_folder = '';
my $program_option = 1; #default: FASTA
my $mispair_cutoff = 100; #set loose cutoff
my $seed_mispair_cutoff = 100; #set loose cutoff
my $mimics_arm_misp_cutoff = 0; #default: no misp in eTM arms
my $bits_cutoff = 0.1; #default: 10% of best bitscore
my $nc_len = 100; #default: homo_mode nc/CDS 100 bp
my $iden_cutoff = 0.7; #default: homo_mode identity 70%
my $jobs = 1; #default: 1 job
my $threads = 1; #default: 1 CPU
my $nonmirbase_file;
my $MUSCLE_alignment_file;
my $score_cutoff;
my $no_orth_mir_search;
my $no_conservation;
my $nc_target;
my $mimics;
my $tab_format;
my $help;
GetOptions (
"n|non_mirbase=s" => \$nonmirbase_file,
"q|qmir=s" => \$query_mir_list_file,
"s|spe=s" => \$species_list_file,
"b|db=s" => \$dbs_folder,
"p|prog=i" => \$program_option,
"a|aln=s" => \$MUSCLE_alignment_file,
"M|total_misp=i" => \$mispair_cutoff,
"m|seed_misp=i" => \$seed_mispair_cutoff,
"f|score=f" => \$score_cutoff,
"c|iden=f" => \$iden_cutoff,
"N|no_conserve" => \$no_conservation,
"R|no_ortho_mir" => \$no_orth_mir_search,
"G|nc_targ" => \$nc_target,
"I|mimics" => \$mimics,
"i|mimics_str=i" => \$mimics_arm_misp_cutoff,
"L|len=i" => \$nc_len,
"j|jobs=i" => \$jobs,
"T|threads=i" => \$threads,
"t|tab" => \$tab_format,
"o|output=s" => \$output_folder,
"h|help" => \$help
);
$score_cutoff = 100 if $mimics; #eTM: set loose score cutoff
$score_cutoff = 4 if (! $mimics and ! $score_cutoff); #default score: 4
STDERR->say ("\n", "=="x40);
STDERR->say ("TarHunter(version $TarHunter_version) Plant miRNA target searching tool");
STDERR->say ("=="x40);
my $time = scalar localtime;
STDERR->say ("$time");
my @cmdline = ();
my $buffer;
if (open my $cmd_fh, "<:raw", "/proc/$$/cmdline") {
read($cmd_fh, $buffer, 8388608);
close $cmd_fh;
@cmdline = split /\0/s, $buffer;
}
STDERR->say ("Command Line: ", join(" ", @cmdline), "\n");
STDERR->say ("-"x6, " Checking arguments and tools ...");
#check arguments
check_arguments();
#check if output folder already exists
if (-e $output_folder) {
STDERR->say ("Output folder already exists. TarHunter exits.");
exit;
}
#get TarHunter directory path
my $program_path = abs_path($0);
my $dir = dirname($program_path);
#create temporary folder
system "mkdir $dir/temp" unless -e "$dir/temp";
my $temp_dir = tempdir('TarHunter_job_XXXXXXXXX', DIR => "$dir/temp");
STDERR->say ("Creating temporary folder ($temp_dir).");
system "mkdir $output_folder $temp_dir/cdhit $temp_dir/ortho $temp_dir/cdsnew $temp_dir/pr_db $temp_dir/mimics";
my $usearch_version = "$dir/bin/usearch8.1.1756_i86linux32";
my $muscle_version = "$dir/bin/muscle3.8.31_i86linux64";
system "chmod a+x $usearch_version" unless -x $usearch_version;
system "chmod a+x $muscle_version" unless -x $muscle_version;
#check tools USEARCH, MUSCLE, MCL, FASTA and RNAhybrid
check_tools();
#################### Reading miRNAs ####################
#read mir list and species list
STDERR->say ("\n", "-"x6, " Reading miRNA file ...");
if ($nonmirbase_file) {
system "cat $nonmirbase_file $dir/miRs/miRBase.fa > $temp_dir/cdhit/miRBase.fa";
}
my $mir_list_href = read_list($query_mir_list_file);
my $species_list_href = read_list($species_list_file);
my $num_of_species = scalar (keys %{ $species_list_href});
#produce new miR list and new miR cluster file
if ($nonmirbase_file) {
system "cd-hit-est -i $temp_dir/cdhit/miRBase.fa -o $temp_dir/cdhit/out -c 0.85 -n 5 -d 40 -g 1 >>$temp_dir/msg.txt 2>&1";
cdhit_parser();
}
my %mir_newlist = %{$mir_list_href};
orth_mir() unless $num_of_species == 1;
#read mirBase sequence file
my $miRBase;
if ($nonmirbase_file) {$miRBase = "$temp_dir/cdhit/miRBase.fa";}
else {$miRBase = "$dir/miRs/miRBase.fa";}
my $mir_seq_href = read_seq_file($miRBase);
#get sequences of new miR list
my %mir_newseq;
for my $id (keys %{$mir_seq_href}) {
if (exists $mir_newlist{$id}) {
$mir_newseq{$id} = $mir_seq_href->{$id};
}
}
if (! %mir_newseq) {
STDERR->say ( "No miRBase miRNAs. TarHunter exits." );
exit;
}
undef $mir_seq_href;
#################### Orthologous gene clustering ####################
my ($MUSCLE_aln_href, $all_proteomes_href);
unless (defined $no_conservation or $num_of_species == 1 or
defined $mimics or defined $nc_target or defined $MUSCLE_alignment_file) {
STDERR->say ("\n", "-"x6, " Orthologous gene clustering ...");
#convert cds dbs to protein dbs
$all_proteomes_href = generate_pr_dbs();
#run UBLAST, MCL, MUSCLE
my $gene_group_href;
$gene_group_href = run_UBLAST_MCL();
run_MUSCLE($gene_group_href);
}
#################### miRNA target search ####################
if ($program_option == 0) {
STDERR->say ("\n", "-"x6, " TarHunter analysis completed.");
exit;
}
STDERR->say ("\n", "-"x6, " miRNA target search ...");
#read muscle alignment file
if ( defined $MUSCLE_alignment_file ) {
$MUSCLE_aln_href = read_seq_file($MUSCLE_alignment_file);
}
#run FASTA and RNAhybrid for each miR
my ($rnahy_output_ID, $fasta_output_ID) = (0, 0);
for my $query_mir (sort keys %mir_newseq) {
next unless $query_mir =~ /^(...)\-/;
my $cds_file;
my @db_files = glob "$dbs_folder/*.fa";
next unless ($cds_file) = grep { /$1\.fa/ } @db_files;
$cds_file = basename $cds_file;
my $mir_tmpfile = IO::File->new("$temp_dir/mir.fa.tmp", 'w') or die $!;
$mir_tmpfile->say (">$query_mir\n$mir_newseq{$query_mir}");
$mir_tmpfile->close;
if ($program_option == 1) {
run_FASTA($query_mir, "$temp_dir/mir.fa.tmp", "$dbs_folder/$cds_file");
} elsif ($program_option == 2) {
unless ( -e "$temp_dir/cdsnew/$cds_file\.cdsnew" ) {
#read cds db
my $cds_file_href = read_seq_file("$dbs_folder/$cds_file");
#creat new cds db for RNAhybrid
STDERR->say ("Producing new $cds_file file for RNAhybrid...");
my $cdsnew_href = cds_new($cds_file_href);
my $cdsnew_file = IO::File->new("$temp_dir/cdsnew/$cds_file\.cdsnew", 'w') or die $!;
foreach (keys %{$cdsnew_href}) {
$cdsnew_file->say (">$_\n$cdsnew_href->{$_}");
}
}
run_RNAhybrid($query_mir, "$temp_dir/mir.fa.tmp", "$temp_dir/cdsnew/$cds_file\.cdsnew");
}
}
#################### Conservation filter ####################
if ($no_conservation or $num_of_species == 1) {
STDERR->say ("\n", "-"x6, " TarHunter analysis completed.");
exit;
}
if ($mimics or $nc_target) { # nc/mimics target conservation
STDERR->say ("\n", "-"x6, " Finding cross-species conserved target sites ...");
mimics_nctarg_conservation();
} else { # CDS target conservation
STDERR->say ("\n", "-"x6, " Finding cross-species conserved CDS target sites ...");
system "sort -k 2,2 $temp_dir/mircluster.tmp > $temp_dir/mircluster.sorted.tmp";
my $program;
if ($program_option == 1) { $program = 'fasta'; }
elsif ($program_option == 2) { $program = 'rnahy'; }
conservation("$temp_dir/$program\_out", "$temp_dir/$program\_conserv_out");
undef $MUSCLE_aln_href;
if ($tab_format) {
system "head -n 1 $temp_dir/$program\_conserv_out >> $output_folder/$program\_out_conserv";
system "tail -n +2 $temp_dir/$program\_conserv_out | sort -u >> $output_folder/$program\_out_conserv";
} else {
remove_duplicate("$temp_dir/$program\_conserv_out", "$output_folder/$program\_out_conserv");
}
}
system "rm -r $temp_dir/cdhit $temp_dir/ortho $temp_dir/cdsnew $temp_dir/pr_db $temp_dir/mimics";
system "rm $temp_dir/*.tmp";
system "rm $temp_dir/*_out" if -e "$temp_dir/*_out";
STDERR->say ("\n", "-"x6, " TarHunter analysis completed.\n");
#################### Subroutines ####################
sub check_arguments {
unless ($query_mir_list_file) {
STDERR->say ("FATAL: query miR list file does not exist.");
usage();
}
unless ($dbs_folder) {
STDERR->say ("FATAL: dbs folder is not provided.");
usage();
}
unless ($species_list_file) {
STDERR->say ("FATAL: species list file does not exist.");
usage();
}
unless ($output_folder) {
STDERR->say ("FATAL: output folder is not provided.");
usage();
}
if ($MUSCLE_alignment_file and !(-e $MUSCLE_alignment_file)) {
STDERR->say ("FATAL: protein alignment file does not exists.");
usage();
}
if ($nonmirbase_file and !(-e $nonmirbase_file)) {
STDERR->say ("FATAL: non-miRBase miRNA file does not exists.");
usage();
}
if ($help) { usage(); }
}
sub usage {
my $usage = << "USAGE";
Usage:
perl TarHunter.pl -q mir_list -s spe_list -b dbs_dir -o output_dir [Options]
Required arguments:
-q (--qmir): miRNA list
-s (--spe): species list
-b (--db): dbs directory
-o (--output): output directory
Options:
-p (--prog): programs
(1:FASTA, 2:RNAhybrid, 0:neither) [Default: 1 ]
-a (--aln): protein alignment file [Default: off]
-n (--nonmirbase): non-miRBase miRNA file [Default: off]
-M (--total_misp): max. total mispairs [Default: off]
-m (--seed_misp): max. seed mispairs [Default: off]
-f (--score): score cutoff [Default: 4 ]
-G (--nc_targ): homo_mode conservation filter [Default: off]
-c (--iden): homo_mode target site identity [Default: 0.7]
-L (--len): homo_mode target site length [Default: 100]
-I (--mimics): eTM search [Default: off]
-i (--mimics_str): eTM stringency
(0: strict, 1: relaxed) [Default: 0 ]
-N (--no_conserve): no ortho_mode conservaton filter [Default: off]
-R (--no_ortho_mir): no orthologous miRNA search [Default: off]
-j (--jobs): jobs [Default: 1 ]
-T (--threads): threads [Default: 1 ]
-t (--tab): tabular format output [Default: off]
-h (--help): help information
USAGE
print $usage;
exit;
}
sub check_tools {
my $CDHIT_command = qx(which cd-hit-est 2>>$temp_dir/msg.txt);
unless($CDHIT_command) {
STDERR->say ("Warning: cd-hit-est not found.");
}
my $MCL_command = qx(which mcl 2>>$temp_dir/msg.txt);
unless($MCL_command ) {
STDERR->say ("Warning: MCL not found.");
}
my $PARALLEL_command = qx(which parallel 2>>$temp_dir/msg.txt);
unless($PARALLEL_command ) {
STDERR->say ("Warning: GNU parallel not found.");
}
my $RNAhybrid_command = qx(which RNAhybrid 2>>$temp_dir/msg.txt);
unless ($RNAhybrid_command) {
STDERR->say ("Warning: RNAhybrid not found.");
}
my $FASTA_command = qx(which $fasta_version 2>>$temp_dir/msg.txt);
unless ($FASTA_command) {
STDERR->say ("Warning: $fasta_version not found.");
}
}
sub read_list {
my $file = shift;
my %list;
my $infile = IO::File->new( $file, 'r' ) or die $!;
while (<$infile>) {
chomp;
s/\r//;
s/^\s*|\s*$//;
next if /^\s*$/;
$list{$_} = 1;
}
return \%list;
}
sub read_seq_file {
my ($seqfile) = @_;
my %seqhash;
local $/ = '>';
my $infile = IO::File->new($seqfile, 'r') or die $!;
while (<$infile>) {
s/\r//g;
chomp;
my ($id, $seq) = /(\S+)(?:.*?)\n(.+)/s or next;
($seq = uc $seq) =~ s/\n//g;
$seqhash{$id} = $seq;
}
return \%seqhash;
}
sub orth_mir {
my $mir_group_href;
if ($nonmirbase_file) { $mir_group_href = ortho_group("$temp_dir/cdhit/mir_cluster.txt") }
else { $mir_group_href = ortho_group("$dir/miRs/mir_cluster.txt") }
#output mircluster.tmp file for conservation analysis
my $mircluster_tmpfile = IO::File->new("$temp_dir/mircluster.tmp", 'w') or die $!;
my %all_spe;
MIR:for my $mir (keys %{$mir_list_href}) {
my %passed_spe;
next unless $mir =~ /^(...)-miR/;
$passed_spe{$1} = 1;
$all_spe{$1} = 1;
#find group ID of query mir
for my $group (keys %{$mir_group_href}) {
next unless exists $mir_group_href->{$group}{$mir};
$mircluster_tmpfile->say ("$group\t$mir");
#without orthologous mir search, only output mircluster.tmp file
next MIR if defined $no_orth_mir_search;
#find the most orthologous mir from the file mir_cluster.txt
local $/ = '#';
my $mircluster_file;
if ($nonmirbase_file) { $mircluster_file = IO::File->new("$temp_dir/cdhit/mir_cluster.txt", 'r') or die $! }
else { $mircluster_file = IO::File->new("$dir/miRs/mir_cluster.txt", 'r') or die $! }
my $omitted = <$mircluster_file>;
while (<$mircluster_file>) {
chomp;
s/\r//g;
next unless /^$group\n/;
my @mir_arr = split/\n/, $_;
for my $mir_member (@mir_arr) {
next unless $mir_member =~ /^(...)-miR/;
next if exists $passed_spe{$1};
next unless exists $species_list_href->{$1};
$mir_newlist{$mir_member} = 1;
$mircluster_tmpfile->say ("$group\t$mir_member");
$passed_spe{$1} = 1;
}
}
}
}
if (! %all_spe) { STDERR->say("No miRBase miRNAs found. TarHunter exits."); exit; }
$num_of_species == 1 if scalar keys %all_spe == 1;
}
sub generate_pr_dbs {
my %all_proteomes;
for my $species (keys %{$species_list_href}) {
my $cds_file;
my %proteome;
my @db_files = glob "$dbs_folder/*.fa";
next unless ($cds_file) = grep { /$species\.fa/ } @db_files;
(my $pr_file = basename $cds_file) =~ s/\.fa/\.pr\.fa/;
#read cds db
my $cds_href = read_seq_file($cds_file);
#translate cds db
my $pr_outfile = IO::File->new("$temp_dir/pr_db/$pr_file", 'w') or die $!;
for my $id (keys %{$cds_href}) {
$proteome{$id} = translation($cds_href->{$id});
$pr_outfile->say (">$id\n$proteome{$id}");
}
$pr_outfile->close;
#merge all pr dbs
STDERR->say ("Converting $species cds dbs to protein dbs ...");
@all_proteomes{keys %proteome} = values %proteome;
}
return \%all_proteomes;
}
sub cdhit_parser {
my ($k, $m, $v);
my %h;
open my $in_fh, '<', "$temp_dir/cdhit/out.clstr" or die $!;
while(<$in_fh>){
chomp;
if (/>Cluster/) {
s/>/#/; s/\s+//;
$k = $_;
} else {
next unless />(...-miR.+)\.\.\..*(\/.+%|\*)$/;
$m = $1;
$v = $2 =~ s/%|\///gr;
$v = 0 if $v eq '*';
push @{ $h{$k} }, [$m, $v];
}
}
close $in_fh;
open my $out, '>', "$temp_dir/cdhit/mir_cluster.txt" or die $!;
for my $i (sort keys %h) {
print $out "$i\n";
for my $j ( @{ $h{$i} } ) {
next unless $j->[1] == 0;
print $out "$j->[0]\n";
last;
}
for my $l ( sort {$b->[1] <=> $a->[1]} @{ $h{$i} } ) {
next if $l->[1] == 0;
print $out "$l->[0]\n";
}
}
close $out;
}
sub translation {
my $DNA = shift;
$DNA = uc $DNA;
my %genetic_code = (
'ATA' => 'I', 'ATC' => 'I', 'ATT' => 'I', 'ATG' => 'M', 'ACA' => 'T',
'ACC' => 'T', 'ACG' => 'T', 'ACT' => 'T', 'AAC' => 'N', 'AAT' => 'N',
'AAA' => 'K', 'AAG' => 'K', 'AGC' => 'S', 'AGT' => 'S', 'AGA' => 'R',
'AGG' => 'R', 'TCA' => 'S', 'TCC' => 'S', 'TCG' => 'S', 'TCT' => 'S',
'TTC' => 'F', 'TTT' => 'F', 'TTA' => 'L', 'TTG' => 'L', 'TAC' => 'Y',
'TAT' => 'Y', 'TAA' => '*', 'TAG' => '*', 'TGC' => 'C', 'TGT' => 'C',
'TGA' => '*', 'TGG' => 'W', 'CTA' => 'L', 'CTC' => 'L', 'CTG' => 'L',
'CTT' => 'L', 'CCA' => 'P', 'CCC' => 'P', 'CCG' => 'P', 'CCT' => 'P',
'CAC' => 'H', 'CAT' => 'H', 'CAA' => 'Q', 'CAG' => 'Q', 'CGA' => 'R',
'CGC' => 'R', 'CGG' => 'R', 'CGT' => 'R', 'GTA' => 'V', 'GTC' => 'V',
'GTG' => 'V', 'GTT' => 'V', 'GCA' => 'A', 'GCC' => 'A', 'GCG' => 'A',
'GCT' => 'A', 'GAC' => 'D', 'GAT' => 'D', 'GAA' => 'E', 'GAG' => 'E',
'GGA' => 'G', 'GGC' => 'G', 'GGG' => 'G', 'GGT' => 'G',
);
my $protein;
my $i = 0;
while ( $i <= length($DNA) - 3 ) {
my $aa = substr($DNA, $i, 3);
if ( exists $genetic_code{$aa} ) {
$protein .= $genetic_code{$aa};
} else {
$protein .= 'X';
}
$i += 3;
}
$protein = $1 if $protein =~ /(.+?)\*/; # remove AAs behind 1st stop codon
return $protein;
}
sub run_UBLAST_MCL {
STDERR->say ("Running UBLAST ...");
my $db_list = IO::File->new ("$temp_dir/pr_db/db_list.txt", 'w') or die $!;
#make ublast db (reciprocal ublast method)
my @pr_dbs = glob "$temp_dir/pr_db/*.pr.fa";
for my $db1 (@pr_dbs) {
(my $db1_name = basename $db1) =~ s/\.pr\.fa//;
system "$usearch_version -makeudb_ublast $db1 -output $temp_dir/pr_db/$db1_name.udb >>$temp_dir/msg.txt 2>&1";
#query each db
for my $db2 (@pr_dbs) {
next if $db1 eq $db2;
(my $db2_name = basename $db2) =~ s/\.pr\.fa//;
$db_list->print ("$db2\t$temp_dir/pr_db/$db1_name.udb\t$temp_dir/pr_db/$db2_name\_$db1_name\_result\n");
}
}
$db_list->close;
system "parallel --no-notice --jobs $jobs --colsep '\t' -a $temp_dir/pr_db/db_list.txt $usearch_version -ublast {1} -db {2} -threads $threads -evalue 1e-5 -top_hit_only -id 0.5 -blast6out {3} >>$temp_dir/msg.txt 2>&1";
# reciprocal best hits (RBH)
my (%passed, %reciprocal_pairs, %gene_group);
my @res_files = glob "$temp_dir/pr_db/*result";
for my $file1 (@res_files) {
if ($file1 =~ /pr\_db\/(.+?)\_(.+?)\_result/) {
my ($spe1, $spe2) = ($1, $2);
next if exists $passed{"$spe2 $spe1"};
$passed{"$spe1 $spe2"} = 1;
(my $file2) = grep { /pr_db\/$spe2\_$spe1\_result/ } @res_files or next;
my $pair1 = protein_pairs($file1);
my $pair2 = protein_pairs($file2);
for (keys %{$pair1}) {
next unless /(.+)\s(.+)/;
if (exists $pair2->{"$2 $1"}) {
$reciprocal_pairs{"$_"} =
$pair1->{$_} . " " . $pair2->{"$2 $1"}; # get reciprocal hits
}
}
}
}
my (%bits, %best_bits);
for (keys %reciprocal_pairs) {
my ($gene1, $gene2) = split;
my ($eval1, $bits1, $eval2, $bits2) = split /\s/, $reciprocal_pairs{$_};
push @{$bits{$gene1}}, $bits1;
push @{$bits{$gene2}}, $bits2;
}
for (keys %bits) { $best_bits{$_} = max @{$bits{$_}}; } # get best bitscore
for (keys %reciprocal_pairs) {
my ($gene1, $gene2) = split;
my ($eval1, $bits1, $eval2, $bits2) = split /\s/, $reciprocal_pairs{$_};
if ($bits1 / $best_bits{$gene1} < $bits_cutoff or
$bits2 / $best_bits{$gene2} < $bits_cutoff) { # bitscore cutoff
delete $reciprocal_pairs{$_};
}
}
# MCL orthologous gene clustering
STDERR->say ("Running MCL ...");
my $abc = IO::File->new ("$temp_dir/pr_db/ublast.abc", 'w') or die $!;
for (keys %reciprocal_pairs) {
my ($gene1, $gene2) = split;
my ($eval1, $bits1, $eval2, $bits2) = split /\s/, $reciprocal_pairs{$_};
print $abc "$gene1\t$gene2\t$bits1\n$gene2\t$gene1\t$bits2\n";
}
$abc->close;
system "mcxload -abc $temp_dir/pr_db/ublast.abc --stream-mirror -o $temp_dir/pr_db/ublast.mci -write-tab $temp_dir/pr_db/ublast.tab 2>>$temp_dir/msg.txt";
system "mcl $temp_dir/pr_db/ublast.mci -I 1.4 -use-tab $temp_dir/pr_db/ublast.tab -o $temp_dir/pr_db/ublast.out 2>>$temp_dir/msg.txt";
my $group_num;
my $mcl_result = IO::File->new ("$temp_dir/pr_db/ublast.out", 'r') or die $!;
while (<$mcl_result>) {
chomp;
next if /^\s*$/;
my $group = 'Group' . ++$group_num;
my @gene_arr = split;
$gene_group{$group}{$_} = 1 for (@gene_arr);
}
return \%gene_group;
}
sub protein_pairs {
my $infile = shift;
my %result;
my $infile_fh = IO::File->new ($infile, 'r') or die $!;
while (<$infile_fh>) {
my @arr = split;
if ($result{"$arr[0] $arr[1]"}) { # query, subj
my ($new_evalue, $new_bitscore) = split /\s/, $result{"$arr[0] $arr[1]"};
next if ($arr[-2] > $new_evalue or $arr[-1] < $new_bitscore);
}
$result{"$arr[0] $arr[1]"} = "$arr[-2] $arr[-1]";
}
return \%result;
}
sub ortho_group {
my $file = shift;
my %ortho;
my $group;
my $infile = IO::File->new( $file, 'r' ) or die $!;
while(<$infile>) {
next if /^\s+$/;
chomp;
s/\r//;
if (/^#(\w+)/) {
$group = $1;
next;
}
$ortho{$group}{$_} ++;
}
return \%ortho;
}
sub run_MUSCLE {
my $gene_group_href = shift;
#generate individual orthologous groups;
STDERR->say ("Generating othologous protein groups ...");
for my $groupID (keys %{$gene_group_href}) {
for my $member ( keys %{$gene_group_href->{$groupID}} ) {
next unless exists $all_proteomes_href->{$member};
if ($all_proteomes_href->{$member} =~ /\*/) { #omit prs with stop-codons
STDERR->say ("$member translated pr (+1 frame) contains stop codon, omitted.");
next;
}
open my $group_file, '>>', "$temp_dir/ortho/$groupID.fa";
print $group_file ">$groupID\|$member\n", $all_proteomes_href->{$member}, "\n";
close $group_file;
}
}
undef $all_proteomes_href;
#running MUSCLE
STDERR->say ("Running MUSCLE ...");
system "find $temp_dir/ortho/ -name '*.fa' | parallel --no-notice --jobs $jobs $muscle_version -quiet -in {} -out {.}.afa";
# store MUSCLE results into hash and output to ortho_aln file
my $AFA_file = IO::File->new("$output_folder/ortho_aln_MUSCLE.afa", 'w') or die $!;
my $MUSCLE_file = IO::File->new("find $temp_dir/ortho/ -name '*.afa' | xargs cat |") or die $!;
my $id = '';
my $seq = '';
while (<$MUSCLE_file>) {
chomp;
if (/^>(\S+)/) {
if ($id ne '') {
$MUSCLE_aln_href->{$id} = $seq;
$AFA_file->say (">$id\n$seq");
}
$id = $1;
$seq = '';
} else {
$seq .= $_;
}
}
$MUSCLE_aln_href->{$id} = $seq;
$AFA_file->say (">$id\n$seq");
STDERR->say ("MUSCLE completed.");
}
sub cds_new { #cds are split into 2kb segments, as RNAhyb corrupts with long seqs
my $cds_file_href = shift;
my %cdsnew;
while ( my ($id, $seq) = each %{$cds_file_href} ) {
my $line_num = 2 * int(length($seq)/2000);
my $new_line_num;
if (length($seq) <= 2000) {
$new_line_num = 1;
} elsif (length($seq)%2000 < 15) {
$new_line_num = $line_num;
} else {
$new_line_num = $line_num + 1;
}
for my $i (1..$new_line_num) {
if ($i%2) {
my $start = 1000 * ($i - $i%2) + 1;
my $newseq = substr($seq, $start-1, 2000);
$cdsnew{"$id\_$start"} = $newseq;
} else {
my $start = 1000 * ($i - $i%2) - 14;
my $newseq = substr($seq, $start-1, 30);
$cdsnew{"$id\_$start"} = $newseq;
}
}
}
return \%cdsnew;
}
sub run_RNAhybrid {
my ($mirID, $mirfile, $cdsnewfile) = @_;
STDERR->print ("Running RNAhybrid for $mirID ... ");
open my $run_rnahyb, "RNAhybrid -d theta -q $mirfile -t $cdsnewfile -u $mispair_cutoff -v $mispair_cutoff -c |" or die $!;
while (<$run_rnahyb>) {
$rnahy_output_ID ++;
chomp;
RNAhybrid_parser($_);
}
STDERR->say ("done.");
}
sub RNAhybrid_parser {
my $line = shift;
my @line_arr = split/:/,$line or return -1;
my ($field7, $field8, $field9, $field10) = @line_arr[7, 8, 9, 10];
return -1 unless ($field7 and $field8 and $field9 and $field10);
my $field7_copy = $field7 =~ s/\s+//gr;
my $field10_copy = $field10 =~ s/\s+//gr;
if (length($field7_copy) - 2 > $mispair_cutoff or length($field10_copy) > $mispair_cutoff) {
return -1;
}
my @f7 = split//, reverse $field7;
my @f8 = split//, reverse $field8;
my @f9 = split//, reverse $field9;
my @f10 = split//, reverse $field10;
#find mir 5' end
my ($mir_beg, $mir_end);
for my $i (0..$#f7) {
next if ($f9[$i]=~/\s/ and $f10[$i]=~/\s/);
$mir_beg = $i;
last;
}
#find mir 3' end
for my $i (-$#f7..0) {
next if ($f9[-$i]=~/\s/ and $f10[-$i]=~/\s/);
$mir_end = -$i;
last;
}
my $cleavage = 'Yes';
my ($tar, $mir, $aln_symb);
my ($pos_count, $score, $g3_g5, $mispair, $seed_mispair, $p8_mispair,
$middle_mispair, $end_mispair, $MFE_ratio) = (0,0,0,0,0,0,0,0,0,0);
for my $i ($mir_beg..$mir_end) {
my $tar_loop_nt = $f7[$i];
my $tar_pair_nt = $f8[$i];
my $mir_pair_nt = $f9[$i];
my $mir_loop_nt = $f10[$i];
if ($tar_loop_nt eq ' ' and $tar_pair_nt eq ' ' and
$mir_pair_nt eq ' ' and $mir_loop_nt ne ' ') {
$mispair ++;
return -1 if $mispair > $mispair_cutoff;
# output alignment symbol
$tar .= '-';
$aln_symb .= ' ';
$mir .= $mir_loop_nt;
$pos_count ++;
#score
if ($pos_count >= 2 and $pos_count <= 12) {
$score += 2;
} else {
$score += 1;
}
return -1 if $score > $score_cutoff;
$g3_g5 = 1 if $pos_count >= 3 and $pos_count <= 5;
#seed mispairs, cleavage
if ($pos_count >= 2 and $pos_count <= 7) {
$seed_mispair ++;
return -1 if $seed_mispair > $seed_mispair_cutoff;
} elsif ($pos_count == 8 and defined $mimics) {
$p8_mispair ++;
} elsif ($pos_count >= 9 and $pos_count <= 11) {
$cleavage = 'No';
$middle_mispair ++ if defined $mimics;
} elsif ($pos_count >= 12 and $pos_count <= 19 and defined $mimics) {
$end_mispair ++;
#mimics arm (positions 2-8, 12-19) misp <= $mimics_arm_misp_cutoff
return -1 if $seed_mispair + $p8_mispair + $end_mispair > $mimics_arm_misp_cutoff;
}
} elsif ($tar_loop_nt ne ' ' and $tar_pair_nt eq ' ' and
$mir_pair_nt eq ' ' and $mir_loop_nt eq ' ') {
$mispair ++;
return -1 if $mispair > $mispair_cutoff;
$tar .= $tar_loop_nt;
$aln_symb .= ' ';
$mir .= '-';
if ($pos_count >= 2 and $pos_count <= 11) { # pos 2-11
$score += 2;
} else {
$score += 1;
}
return -1 if $score > $score_cutoff;
$g3_g5 = 1 if $pos_count >= 3 and $pos_count <= 4; # pos 2-4
if ($pos_count >= 2 and $pos_count <= 7) {
$seed_mispair ++;
return -1 if $seed_mispair > $seed_mispair_cutoff;
} elsif ($pos_count == 8 and defined $mimics) {
$p8_mispair ++;
} elsif ($pos_count >= 9 and $pos_count <= 10) { # pos9-10
$cleavage = 'No';
$middle_mispair ++ if defined $mimics;
} elsif ($pos_count >= 12 and $pos_count <= 19 and defined $mimics) {
$end_mispair ++;
return -1 if $seed_mispair + $p8_mispair + $end_mispair > $mimics_arm_misp_cutoff;
}
} elsif ($tar_loop_nt ne ' ' and $tar_pair_nt eq ' ' and
$mir_pair_nt eq ' ' and $mir_loop_nt ne ' ') {
$mispair += 1;
return -1 if $mispair > $mispair_cutoff;
$tar .= $tar_loop_nt;
$aln_symb .= ' ';
$mir .= $mir_loop_nt;
$pos_count ++;
if ($pos_count >= 2 and $pos_count <= 12) {
$score += 2;
} else {
$score += 1;
}
return -1 if $score > $score_cutoff;
$g3_g5 = 1 if $pos_count >= 3 and $pos_count <= 5;
if ($pos_count >= 2 and $pos_count <= 7) {
$seed_mispair ++;
return -1 if $seed_mispair > $seed_mispair_cutoff;
} elsif ($pos_count == 8 and defined $mimics) {
$p8_mispair ++;
} elsif ($pos_count >= 9 and $pos_count <= 11) {
$cleavage = 'No';
$middle_mispair ++ if defined $mimics;
} elsif ($pos_count >= 12 and $pos_count <= 19 and defined $mimics) {
$end_mispair ++;
return -1 if $seed_mispair + $p8_mispair + $end_mispair > $mimics_arm_misp_cutoff;
}
} elsif ( ($tar_pair_nt eq 'U' and $mir_pair_nt eq 'G') or
($tar_pair_nt eq 'G' and $mir_pair_nt eq 'U') ) {
$mispair += 1;
return -1 if $mispair > $mispair_cutoff;
$tar .= $tar_pair_nt;
$aln_symb .= 'o';
$mir .= $mir_pair_nt;
$pos_count ++;
if ($pos_count >= 2 and $pos_count <= 12) {
$score += 1;
} else {
$score += 0.5;
}
return -1 if $score > $score_cutoff;
$g3_g5 = 1 if $pos_count >= 3 and $pos_count <= 5;
if ($pos_count >= 2 and $pos_count <= 7) {
$seed_mispair ++;
return -1 if $seed_mispair > $seed_mispair_cutoff;
} elsif ($pos_count == 8 and defined $mimics) {
$p8_mispair ++;
} elsif ($pos_count >= 9 and $pos_count <= 11) {
$cleavage = 'No';
$middle_mispair ++ if defined $mimics;
} elsif ($pos_count >= 12 and $pos_count <= 19 and defined $mimics) {
$end_mispair ++;
return -1 if $seed_mispair + $p8_mispair + $end_mispair > $mimics_arm_misp_cutoff;
}
} else {
$tar .= $tar_pair_nt;
$aln_symb .= '|';
$mir .= $mir_pair_nt;
$pos_count ++;
}
}
#mimics positions 9-11 misp <= 5
if ( defined $mimics and
($cleavage eq 'Yes' or $middle_mispair > 5 or
$seed_mispair + $p8_mispair + $end_mispair > $mimics_arm_misp_cutoff) ) {#reset if needed
return -1;
}
my $tarID = $line_arr[0] =~ s/(.+)\_(\d+)$/$1/r;
my $split_pos = $2;
my $mirID = $line_arr[2];
my $rnahyb_pos = $line_arr[6];
my $rna_start_pos = $split_pos + $rnahyb_pos;
$tar = reverse $tar; #change to 5' 3'
my $mir_r = reverse $mir; #change to 3' 5'
$aln_symb = reverse $aln_symb;
$score ++ if $g3_g5;
return -1 if $score > $score_cutoff;
#output detailed info
open my $rnahyb_details, '>>', "$output_folder/rnahy_out_details" or die $!;
if ($tab_format) {
print $rnahyb_details "\#R$rnahy_output_ID\t";
print $rnahyb_details "$tarID\t$tar\t$mirID\t$mir\t$mispair\t$score\t";
print $rnahyb_details "$seed_mispair\t$cleavage";
print $rnahyb_details "\t$rna_start_pos\n";
} else {
print $rnahyb_details "\#R$rnahy_output_ID", "="x60, "\n";
printf $rnahyb_details "%-35s\t%s\n", $tarID, "5' $tar 3'";
printf $rnahyb_details "%-35s\t%s\n", "", " $aln_symb";
printf $rnahyb_details "%-35s\t%s\n\n", $mirID, "3' $mir_r 5'";
printf $rnahyb_details "%-35s\t%s\n", "Total mispairs:", $mispair;
printf $rnahyb_details "%-35s\t%s\n", "Score:", $score;
printf $rnahyb_details "%-35s\t%s\n", "Seed mismpairs:", $seed_mispair;
printf $rnahyb_details "%-35s\t%s\n", "Cleavage:", $cleavage;
printf $rnahyb_details "%-35s\t%s\n\n", "Position:", $rna_start_pos;
}
close $rnahyb_details;
return -1 if defined $mimics;
#pr start-end positions
my $tar_copy = $tar =~ s/[^\w]//gr;
my $rna_end_pos = $rna_start_pos + length($tar_copy) - 1;
my $pr_start_pos = ($rna_start_pos - $rna_start_pos%3) / 3 + 1;
my $pr_end_pos = ($rna_end_pos - $rna_end_pos%3) / 3;
#find algnment positions of pr start-end
my ($aln_pr_start, $aln_pr_end, $group);
for my $k (keys %{$MUSCLE_aln_href}) {
if ($k =~ /(\S+?)\|$tarID/) {
($aln_pr_start, $aln_pr_end) =
aln_find_pos($MUSCLE_aln_href->{$k}, $pr_start_pos, $pr_end_pos);
$group = $1;
#output rnahy_out file for conservation analysis
open my $rnahyb_output, '>>', "$temp_dir/rnahy_out" or die $!;
print $rnahyb_output "$mirID\t$tarID\t$aln_pr_start\t",
"$aln_pr_end\t$group\tR$rnahy_output_ID\n";
close $rnahyb_output;
}
}
}
sub aln_find_pos {
my ($gene_seq, $start, $end) = @_;
my $segment1 = substr($gene_seq, 0, $start) =~ s/-//gr;
my $offset = $start - length($segment1);
my $segment2 = substr($gene_seq, $start);
my ($i, $newpos) = (0,0);
if ($offset > 0) {
while($segment2 =~ /\w/g) {
$i++;
$newpos = pos($segment2);
last if $i == $offset;
}
}
my $newstart = $start + $newpos;
$offset = $end - $start;
my $segment3 = substr($gene_seq, $newstart);
my $j;
while($segment3 =~ /\w/g) {
$j++;
$newpos = pos($segment3);
last if $j == $offset;
}
my $newend = $newstart + $newpos;
return ($newstart, $newend);
}
sub run_FASTA {
my ($mirName, $mirfile, $cdsfile) = @_;
my $mirAbbr = substr($mirName, 0, 5);
my %fasta_hash;
my $find_first = 0;
my ($geneName, $rna_pair_beg_pos, $rna_pair_end_pos, $rna_start_pos, $rna_end_pos,
$mir_end_pos, $mir_rc, $aln, $aln_copy, $tar, $aln_line, $tar_line, $left_pos);
my (@mir, @tar, @aln);
STDERR->print ("Running FASTA for $mirName ... ");
open my $run_fasta, "$fasta_version -A -n -Q -i -U -T $threads -E 100000 $mirfile $cdsfile 1 2>/dev/null |" or die $!;
while (<$run_fasta>) {
$fasta_output_ID ++;
chomp;
unless ($find_first) {
next unless /^>>(\S+)/;
$geneName = $1;
$find_first = 1;
next;
} else {
if (/^>>(\S+)/) {
$geneName = $1;
} elsif (/Smith-Waterman.+(\d+)-\d+:(\d+)-(\d+)/) {
$mir_end_pos = $1;
$rna_pair_beg_pos = $2;
$rna_pair_end_pos = $3;
} elsif (/^($mirAbbr\-\s+)(\S+)/) {
$mir_rc = $2;
$left_pos = length($1);
$aln_line = <$run_fasta>; #aln symbol
if ($aln_line =~ /^(\s+)(\S.*\S)(\s*)$/) {
$aln = $2;
$aln_copy = $aln =~ s/://gr;
}
$tar_line = <$run_fasta>;
$tar = substr($tar_line, $left_pos, length($mir_rc));
$tar =~ s/\s/-/g; #some targs have unpairings at 5' and 3' end
#primary filtering
next if (defined $aln_copy and length($aln_copy) > $mispair_cutoff);
FASTA_parser($mirName, $geneName, $mir_rc, $tar,
$rna_pair_beg_pos, $rna_pair_end_pos);
}
}
}
close $run_fasta;
STDERR->say ("done.");
}
sub FASTA_parser {
my ($mirID, $geneID, $mir_rc, $tar, $rna_start_pos, $rna_end_pos) = @_;
my $mir_rc_copy = $mir_rc =~ s/-//gr;
return -1 if length($mir_rc_copy) < 10; #remove some low-quality results
(my $mir = reverse uc $mir_rc) =~ tr/AUGC/UACG/;
$tar = reverse uc $tar;
my @mir = split //, $mir; #5'-3'
my @tar = split //, $tar; #3'-5'
my $aln_symb;
my $cleavage = 'Yes';
my ($count, $score, $g3g5, $total_misp, $seed_misp, $p8_misp, $middle_misp, $end_misp)
= (0,0,0,0,0,0,0,0);
my %pairing = (
#class1:pairing, 2:G:U, 3:mismatch, 4:bulge miR strand, 5:bulge tar strand
'AU' => 'class1', 'UA' => 'class1', 'GC' => 'class1', 'CG' => 'class1',
'UG' => 'class2', 'GU' => 'class2', 'AG' => 'class3', 'AC' => 'class3',
'UC' => 'class3', 'GA' => 'class3', 'CA' => 'class3', 'CU' => 'class3',
'AA' => 'class3', 'UU' => 'class3', 'GG' => 'class3', 'CC' => 'class3',
'A-' => 'class4', 'U-' => 'class4', 'G-' => 'class4', 'C-' => 'class4',
'-A' => 'class5', '-U' => 'class5', '-G' => 'class5', '-C' => 'class5',
);
for my $i (0..$#mir) {
my $mir_nt = $mir[$i];
my $tar_nt = $tar[$i];
return -1 unless defined $pairing{"$mir_nt$tar_nt"};
if ($pairing{"$mir_nt$tar_nt"} eq 'class1') {
$count ++;
$aln_symb .= '|';
} elsif ($pairing{"$mir_nt$tar_nt"} eq 'class2') {
$count ++;
$total_misp ++;
return -1 if $total_misp > $mispair_cutoff;
$aln_symb .= 'o';
#Score
if ($count >= 2 and $count <= 12) {
$score += 1;
} else {
$score += 0.5;
}
return -1 if $score > $score_cutoff;
$g3g5 = 1 if $count >= 3 and $count <= 5;
#seed mispairs, cleavage
if ($count >= 2 and $count <= 7) {
$seed_misp ++;
return -1 if $seed_misp > $seed_mispair_cutoff;
} elsif ($count == 8 and defined $mimics) {
$p8_misp ++;
} elsif ($count >= 9 and $count <= 11) {
$cleavage = 'No';
$middle_misp ++ if defined $mimics;
} elsif ($count >= 12 and $count <= 19 and defined $mimics) {
$end_misp ++;
#mimics arm (positions 2-8, 12-19) misp <= $mimics_arm_misp_cutoff
return -1 if $seed_misp + $p8_misp + $end_misp > $mimics_arm_misp_cutoff;
}
} elsif ($pairing{"$mir_nt$tar_nt"} eq 'class3') {
$count ++;
$total_misp ++;
return -1 if $total_misp > $mispair_cutoff;
$aln_symb .= ' ';
if ($count >= 2 and $count <= 12) {
$score += 2;
} else {
$score += 1;
}
return -1 if $score > $score_cutoff;
$g3g5 = 1 if $count >= 3 and $count <= 5;
if ($count >= 2 and $count <= 7) {
$seed_misp ++;
return -1 if $seed_misp > $seed_mispair_cutoff;
} elsif ($count == 8 and defined $mimics) {
$p8_misp ++;
} elsif ($count >= 9 and $count <= 11) {
$cleavage = 'No';
$middle_misp ++ if defined $mimics;
} elsif ($count >= 12 and $count <= 19 and defined $mimics) {
$end_misp ++;
return -1 if $seed_misp + $p8_misp + $end_misp > $mimics_arm_misp_cutoff;
}
} elsif ($pairing{"$mir_nt$tar_nt"} eq 'class4') {
$count ++;
$total_misp ++;
return -1 if $total_misp > $mispair_cutoff;
$aln_symb .= ' ';
if ($count >= 2 and $count <= 12) {
$score += 2;
} else {
$score += 1;
}
return -1 if $score > $score_cutoff;
$g3g5 = 1 if $count >= 3 and $count <= 5;
if ($count >= 2 and $count <= 7) {
$seed_misp ++;
return -1 if $seed_misp > $seed_mispair_cutoff;
} elsif ($count == 8 and defined $mimics) {
$p8_misp ++;
} elsif ($count >= 9 and $count <= 11) {
$cleavage = 'No';
$middle_misp ++ if defined $mimics;
} elsif ($count >= 12 and $count <= 19 and defined $mimics) {
$end_misp ++;
return -1 if $seed_misp + $p8_misp + $end_misp > $mimics_arm_misp_cutoff;
}
} elsif ($pairing{"$mir_nt$tar_nt"} eq 'class5') {
$total_misp ++;
return -1 if $total_misp > $mispair_cutoff;
$aln_symb .= ' ';
if ($count >= 2 and $count <= 11) {
$score += 2;
} else {
$score += 1;
}
return -1 if $score > $score_cutoff;
$g3g5 = 1 if $count >= 3 and $count <= 4;
if ($count >= 2 and $count <= 7) {
$seed_misp ++;
return -1 if $seed_misp > $seed_mispair_cutoff;
} elsif ($count == 8 and defined $mimics) {
$p8_misp ++;
} elsif ($count >= 9 and $count <= 10) {
$cleavage = 'No';
$middle_misp ++ if defined $mimics;
} elsif ($count >= 11 and $count <= 19 and defined $mimics) {# pos 12-19
$end_misp ++;
return -1 if $seed_misp + $p8_misp + $end_misp > $mimics_arm_misp_cutoff;
}
}
}
$score ++ if $g3g5;
return -1 if $score > $score_cutoff;
#mimics positions 9-11 misp <= 5
if ( defined $mimics and
($cleavage eq 'Yes' or $middle_misp > 5 or
$seed_misp + $p8_misp + $end_misp > $mimics_arm_misp_cutoff) ) {
return -1;
}
$tar = reverse $tar; #change to 5'-3'
my $mir_r = reverse $mir; #change to 3'-5'
$aln_symb = reverse $aln_symb;
#output detailed info
open my $fasta_details, '>>', "$output_folder/fasta_out_details" or die $!;
if ($tab_format) {
print $fasta_details "\#F$fasta_output_ID\t";
print $fasta_details "$geneID\t$tar\t$mirID\t$mir\t$total_misp\t",
"$score\t$seed_misp\t$cleavage\t$rna_start_pos\n";
} else {
print $fasta_details "\#F$fasta_output_ID", "=="x40, "\n";
printf $fasta_details "%-35s\t%s\n", $geneID, "5' $tar 3'";
printf $fasta_details "%-35s\t%s\n", "", " $aln_symb";
printf $fasta_details "%-35s\t%s\n\n", $mirID, "3' $mir_r 5'";
printf $fasta_details "%-35s\t%s\n", "Total mispairs:", $total_misp;
printf $fasta_details "%-35s\t%s\n", "Score:", $score;
printf $fasta_details "%-35s\t%s\n", "Seed mispairs:", $seed_misp;
printf $fasta_details "%-35s\t%s\n", "Cleavage:", $cleavage;
printf $fasta_details "%-35s\t%s\n\n", "Position:", $rna_start_pos;
}
close $fasta_details;
return -1 if defined $mimics;
my $pr_start_pos = ($rna_start_pos - $rna_start_pos%3) / 3 + 1;
my $pr_end_pos = ($rna_end_pos - $rna_end_pos%3) / 3;
#output fasta_output file for conservation analysis
my ($pr_aln_start, $pr_aln_end, $group);
for my $k (keys %{$MUSCLE_aln_href}) {
if ($k =~ /(\S+?)\|$geneID$/) {
($pr_aln_start, $pr_aln_end) =
aln_find_pos($MUSCLE_aln_href->{$k}, $pr_start_pos, $pr_end_pos);
$group = $1;
open my $fasta_output, '>>', "$temp_dir/fasta_out" or die $!;
print $fasta_output "$mirID\t$geneID\t$pr_aln_start\t",
"$pr_aln_end\t$group\tF$fasta_output_ID\n";
close $fasta_output;
}
}
}
sub conservation {
my ($infile, $outfile) = @_;
unless (-e $infile) {
STDERR->say ("\nNo hits found.\n", "-"x6, " TarHunter analysis completed.");
exit;
}
my $infile2 = basename $infile =~ s/(.+)/$1\_details/r;
system " sort -k 1,1 $infile > $infile\.sorted.tmp ";
system " join -1 1 -2 2 $infile\.sorted.tmp $temp_dir/mircluster.sorted.tmp > $infile\.merged.tmp ";
#merged file format: mirID-geneID-start-end-geneGroup-siteNum-mirCluster
my $result_href = parse_merged_file("$infile\.merged.tmp");
#store detailed info into hash
my %hash_details;
open my $rnahy_fasta_details, "$output_folder/$infile2" or die $!;
local $/ = '#';
my $discarded = <$rnahy_fasta_details>;
while (<$rnahy_fasta_details>) {
chomp;
my ($id, $details);
if ($tab_format) {
($id, $details) = /(.+?)\t(.+)/s or next;
} else {
($id, $details) = /(.+?)\n(.+)/s or next;
}
$id =~ s/=//g;
$hash_details{$id} = $details;
}
close $rnahy_fasta_details;
local $/ = "\n";
#sequenceID as key, speciesID as value
my %hash_species;
for my $species (keys %{$species_list_href}) {
for my $file(glob "$dbs_folder/*.fa") {
next unless $file =~ /$species\.fa/;
my $spe_hash = IO::File->new ($file, 'r') or die $!;
while (<$spe_hash>) {
chomp;
$hash_species{$1} = $species if />(\S+)(.*)/;
}
}
}
if ($infile eq "$temp_dir/rnahy_out") {
STDERR->say ("Outputing RNAhybrid results ...");
} elsif ($infile eq "$temp_dir/fasta_out") {
STDERR->say ("Outputing FASTA results ...");
}
my ($identity_cutoff_pr, $identity_cutoff_nt) = (0.55, 0.65);
my $final_output = IO::File->new($outfile, 'w') or die $!;
if ($tab_format) { #tab format header
print $final_output "miRID\tmiR sequence\ttargetID\ttarget sequence\tTotal mispair\t",
"Score\tSeed mispair\tCleavage\tPosition\tSpecies\tIdentity(nt)\%\tIdentity(aa)\%\n";
}
for my $gene_group (keys %{$result_href}) {
for my $mir_cluster (keys %{$result_href->{$gene_group}}) {
for my $site_num (keys %{$result_href->{$gene_group}{$mir_cluster}}) {
my $inside_group = $result_href->{$gene_group}{$mir_cluster}{$site_num};
#inside a unique group
my (@start_arr, @end_arr, %group_species);
for my $element (keys %{$inside_group}) {
my $group_members = $inside_group->{$element};
my $species = $hash_species{$group_members->[1]}; #$group_member->[1]:geneID
push @start_arr, $group_members->[2];
push @end_arr, $group_members->[3];
$group_species{$species} ++;
}
my $num_of_species = scalar (keys %group_species);
next if $num_of_species < 2; # at least 2 species
# target site aa boundary
my $min = min @start_arr;
my $max = max @end_arr;
my $aa_tar_region;
open my $site_pr_file, '>>', "$temp_dir/tar_site_pr.fa" or die $!;
open my $site_nt_file, '>>', "$temp_dir/tar_site_nt.fa" or die $!;
for my $element (keys %{$inside_group}) {
#output target site aa seqs
my $gene_id = $inside_group->{$element}->[1];
$aa_tar_region = extract_site($min, $max, $gene_id, $gene_group);
$aa_tar_region =~ s/-//g;
print $site_pr_file ">$gene_id\n$aa_tar_region\n";
#output target site nt seqs
my (@details_arr, @tar_nt_arr, $tar_nt, $aln_symb);
if ($tab_format) {
@details_arr = split/\t/, $hash_details{$element};
$tar_nt = $details_arr[1]; #site nt seq 5'-3'
} else {
@details_arr = split/\n/, $hash_details{$element};
@tar_nt_arr = split/\s+/, $details_arr[0];
$tar_nt = $tar_nt_arr[-2]; #site nt seq 5'-3'
}
print $site_nt_file ">$gene_id\n$tar_nt\n";
}
close $site_pr_file;
close $site_nt_file;
my ($aln_pr, $site_identity_pr) = aln_identity("$temp_dir/tar_site_pr.fa");
my ($aln_nt, $site_identity_nt) = aln_identity("$temp_dir/tar_site_nt.fa");
system "rm $temp_dir/tar_site_nt.fa $temp_dir/tar_site_pr.fa";
next if $site_identity_pr <= $identity_cutoff_pr;
next if $site_identity_nt <= $identity_cutoff_nt;
my $rounded_identity_nt = sprintf("%.1f", $site_identity_nt * 100);
my $rounded_identity_pr = sprintf("%.1f", $site_identity_pr * 100);
print $final_output "=="x40, "\n" unless $tab_format;
if ($tab_format) {
for my $element (sort {$inside_group->{$a}->[0] cmp $inside_group->{$b}->[0]}
keys %{$inside_group}) {
chop $hash_details{$element};
print $final_output "$hash_details{$element}\t";
print $final_output "$num_of_species\t";
print $final_output "$rounded_identity_nt\t$rounded_identity_pr\n";
}
} else {
#output details
for my $element (sort {$inside_group->{$a}->[0] cmp $inside_group->{$b}->[0]}
keys %{$inside_group}) {
print $final_output "$hash_details{$element}\n";
}
print $final_output "Conserved sites found in $num_of_species species.\n\n";
#output target site nt alignment
print $final_output "Target site nucleotide acid identity: ",
$rounded_identity_nt, "\%\n";
print $final_output "$aln_nt\n";
#output target site aa alignment
print $final_output "Target site aa motif identity: ",
$rounded_identity_pr, "\%\n";
print $final_output "$aln_pr\n";
}
}
}
}
}
sub aln_identity {
my $file = shift;
my ($identity, $first) = (0,0);
my ($aln_detail, $len, $symb, %duplicate);
open my $site_aln_identity, "$muscle_version -in $file -clwstrict -quiet |" or die $!;
while (<$site_aln_identity>) {
chomp;
next if /^CLUSTAL/;
next if /^\s*$/;
next if exists $duplicate{$_};
$duplicate{$_} = 1;
$aln_detail .= "$_\n"; #target site alignment
unless ($first) {
if (/^((\S+?)\s+)\S+/) {
$len = length $1;
$first = 1;
}
} else {
next unless /\.|\:|\*/;
$symb = substr($_, $len);
my $symb_copy = $symb =~ s/\*//gr;
$identity = 1-length($symb_copy)/length($symb);
}
}
return ($aln_detail, $identity);
}
sub parse_merged_file {
my $file = shift;
my %hash;
my ($first_in_all, $first_in_group, $siteNo, $aa_shift) = (1, 1, 0, 2);
my ($curr, $prev);
open my $merged_file, qq(sort -k 5,5 -k 7,7 -k 3n,3n -k 4n,4n $file |);
while ($curr = <$merged_file>) {
$curr =~ s/\r//g;
chomp $curr;
next if $curr =~ /^\s+$/;
if ($first_in_all) { #first in all
$prev = $curr;
$first_in_all = 0;
next;
}
my @a = split/\s+/, $prev;
my @b = split/\s+/, $curr;
my ($a2, $a3, $a4, $a5, $a6) = @a[2..6];
my ($b2, $b3, $b4, $b5, $b6) = @b[2..6];
if ($a4 eq $b4 and $a6 eq $b6 and abs($a2 - $b2) <= $aa_shift and
abs($a3 - $b3) <= $aa_shift and $first_in_group ) { #first in group
$siteNo ++;
$hash{$a4}{$a6}{$siteNo}{$a5} = [@a[0..3]]; #mirID-geneID-start-end
$hash{$a4}{$a6}{$siteNo}{$b5} = [@b[0..3]];
$first_in_group = 0;
} elsif ($a4 eq $b4 and $a6 eq $b6 and abs($a2 - $b2) <= $aa_shift and
abs($a3 - $b3) <= $aa_shift and $first_in_group == 0) {
$hash{$a4}{$a6}{$siteNo}{$b5} = [@b[0..3]];
} else {
$first_in_group = 1;
}
$prev = $curr;
}
close $merged_file;
return \%hash;
}
sub extract_site {
my ($min, $max, $gene, $group) = @_;
my ($tarsite_region, $region, $reg1, $reg2);
my $seq = $MUSCLE_aln_href->{"$group\|$gene"};
# target site aa motif
$tarsite_region = substr($seq, $min - 1, $max - $min + 1);
return $tarsite_region;
}
sub remove_duplicate {
my ($infile, $outfile) = @_;
my %result;
my $delimiter = "=="x40;
local $/ = "$delimiter";
my $infile_fh = IO::File->new ($infile, 'r') or die $!;
while (<$infile_fh>) {
chomp;
next if /^\s*$/;
$result{$_} = 1;
}
$infile_fh->close;
my $outfile_fh = IO::File->new ($outfile, 'w') or die $!;
for my $group (keys %result) {
print $outfile_fh "$delimiter\n$group";
}
}
sub mimics_nctarg_conservation {
local $/ = '#';
my %mir_group;
my $mir_cluster_file;
if ($nonmirbase_file) { $mir_cluster_file = IO::File->new ("$temp_dir/cdhit/mir_cluster.txt") }
else { $mir_cluster_file = IO::File->new ("$dir/miRs/mir_cluster.txt", 'r') }
my $omitted = <$mir_cluster_file>;
while(<$mir_cluster_file>) {
chomp;
s/\r//g;
my @arr = split/\n/, $_;
for my $member (@arr[1..$#arr]) {
next if $member =~ /^\s*$/;
$member =~ s/^\s*|\s*$//;
$mir_group{$member} = $arr[0];
}
}
$mir_cluster_file->close;
# store fasta/rnahy output details into hash
my ($delimiter, $records, $fasta_rnahy_file, $final_out);
my (%mimics_species, %info, %hash_details);
if ($program_option == 1) {
$fasta_rnahy_file = "$output_folder/fasta_out_details";
$final_out = "$output_folder/fastaout_nc_conserv";
} elsif ($program_option == 2) {
$fasta_rnahy_file = "$output_folder/rnahy_out_details";
$final_out = "$output_folder/rnahyout_nc_details";
}
unless (-e $fasta_rnahy_file) {
STDERR->say ("\nNo hits found.\n", "-"x6, " TarHunter analysis completed.");
exit;
}
$tab_format ? ($delimiter = "\t") : ($delimiter = "\n");
my $details_outfile = IO::File->new($fasta_rnahy_file, 'r') or die $!;
while (<$details_outfile>) {
s/\r//g;
chomp;
next if /^\s*$/;
my @arr = split/$delimiter/, $_;
my $ID = $arr[0] =~ s/=//gr;
my $geneID = $1 if $arr[1] =~ /^(\S+).*/;
my $mirID = $1 if $arr[3] =~ /^(\S+).*/;
my $start_pos = $1 if $arr[9] =~ /.*?(\d+)$/;
$mimics_species{$1} = 1 if $mirID =~ /^(\w+)-miR/;
$info{$ID} = [$mirID, $geneID, $start_pos];
my @arr2 = (split/$delimiter/, $_, 2) or next;
$hash_details{$ID} = $arr2[1];
}
$details_outfile->close;
local $/ = "\n";
# get all sequences from mimics result
my %mimics_all_seq;
for my $species (keys %mimics_species) {
my $seq_href = read_seq_file("$dbs_folder/$species.fa");
@mimics_all_seq{keys %{$seq_href}} = values %{$seq_href};
}
# extract upstream and downstream 50-bp (default) seqs
for my $ID (sort keys %info) {
my $mirID = $info{$ID}->[0];
my $geneID = $info{$ID}->[1];
my $pos = $info{$ID}->[2];
my $seq = $mimics_all_seq{$geneID};
my $upstream = ($pos > int($nc_len/2)) ? ($pos - int($nc_len/2)) : 1;
my $newseq = substr($seq, $upstream - 1, $nc_len);
my $species = $1 if $mirID =~ /^(\w+)-miR/;
open my $nc_seq, '>>', "$temp_dir/mimics/$species.fa";
print $nc_seq ">$ID\|$mirID\|$geneID\n$newseq\n";
close $nc_seq;
}
# run usearch
my %checked;
my $db_list = IO::File->new ("$temp_dir/mimics/db_list.txt", 'w') or die $!;
for my $species1 (keys %mimics_species) {
my $db1 = "$temp_dir/mimics/$species1.fa";
next unless (-e $db1);
system "$usearch_version -makeudb_usearch $db1 -output $temp_dir/mimics/$species1.udb >>$temp_dir/msg.txt 2>&1";
for my $species2 (keys %mimics_species) {
next if $species1 eq $species2;
next if exists $checked{"$species2$species1"};
my $db2 = "$temp_dir/mimics/$species2.fa";
next unless (-e $db2);
$checked{"$species1$species2"} = 1;
$db_list->print ("$db2\t$temp_dir/mimics/$species1.udb\t$temp_dir/mimics/$species2\_$species1\_result\n");
}
}
$db_list->close;
system "parallel --no-notice --jobs $jobs --colsep '\t' -a $temp_dir/mimics/db_list.txt $usearch_version -usearch_global {1} -db {2} -strand plus -threads $threads -id $iden_cutoff -blast6out {3} >>$temp_dir/msg.txt 2>&1";
my @result_files = glob "$temp_dir/mimics/*result";
STDERR->say ("\n", "-"x6, " TarHunter analysis completed.") and exit if @result_files < 1;
system "cut -f1,2 $temp_dir/mimics/*result | sort -u > $temp_dir/mimics/ublast_pairs";
# parsing usearch result
my %pair;
my $ublast_pair_file = IO::File->new("$temp_dir/mimics/ublast_pairs", 'r');
while (<$ublast_pair_file>) {
chomp;
my @line = split;
my @id1 = split/\|/, $line[0];
my @id2 = split/\|/, $line[1];
if ( $mir_group{$id1[1]} eq $mir_group{$id2[1]} ) {
$pair{$id1[0]}{$id2[0]} = 1;
$pair{$id2[0]}{$id1[0]} = 1;
}
}
$ublast_pair_file->close;
my ($elements_href, $group_num, $group_href);
while ( my ($memb, $v) = each %pair ) {
my @elems = keys %{ recursive_grouping(\%pair, $memb, $elements_href) };
$group_num ++;
delete @pair{@elems};
for (@elems) {
$group_href->{$group_num}{$_} = 1;
}
}
my $output = IO::File->new ($final_out, 'w');
for my $groupID (keys %{$group_href}) {
print $output ("=="x40, "\n") unless $tab_format;
my $seqs = IO::File->new ("$temp_dir/mimics/seqs.fa", 'w');
for my $ID ( keys %{$group_href->{$groupID}} ) {
print $output "$hash_details{$ID}\n";
unless ($tab_format) {
my $gene = $info{$ID}->[1];
my $pos = $info{$ID}->[2];
my $seq = $mimics_all_seq{$gene};
my $upstream = ( $pos > int($nc_len/2) ) ? ($pos - int($nc_len/2)) : 1;
my $seq_100bp = substr($seq, $upstream - 1, $nc_len);
print $seqs ">$gene\n$seq_100bp\n";
}
}
unless ($tab_format) {
open my $seqs_aln, "$muscle_version -in $temp_dir/mimics/seqs.fa -clwstrict -quiet |" or die $!;
print $output "Alignment of target sites:\n";
while (<$seqs_aln>) {
chomp;
next if /^CLUSTAL/;
print $output "$_\n";
}
}
}
}
sub recursive_grouping {
my ($gene_pair, $gene, $elements_href) = @_;
if (exists $elements_href->{$gene}) {
return;
} else {
$elements_href->{$gene} = 1;
}
for my $k (keys %{$gene_pair->{$gene}}) {
recursive_grouping($gene_pair, $k, $elements_href);
}
return $elements_href;
}
__END__