package ipr_scan;

# Load libraries
use English;
use LWP;
use Term::ANSIColor;
use XML::Simple;
use XML::XPath;
use XML::XPath::Node::Element;
use Getopt::Long qw(:config no_ignore_case bundling);
use File::Basename;
use Data::Dumper;

# Global parameters and fields
our (%g)=	(	'TCA'=>'S','TCC'=>'S','TCG'=>'S','TCT'=>'S','TTC'=>'F',
			'TTT'=>'F','TTA'=>'L','TTG'=>'L','TAC'=>'Y','TAT'=>'Y',
			'TAA'=>'Z','TAG'=>'Z','TGC'=>'C','TGT'=>'C','TGA'=>'Z',	# Stop codons => TAA, TAG, TGA => Z
			'TGG'=>'W','CTA'=>'L','CTC'=>'L','CTG'=>'L','CTT'=>'L',
			'CCA'=>'P','CCC'=>'P','CCG'=>'P','CCT'=>'P','CAC'=>'H',
			'CAT'=>'H','CAA'=>'Q','CAG'=>'Q','CGA'=>'R','CGC'=>'R',
			'CGG'=>'R','CGT'=>'R','ATA'=>'I','ATC'=>'I','ATT'=>'I',
			'ATG'=>'M','ACA'=>'T','ACC'=>'T','ACG'=>'T','ACT'=>'T',
			'AAC'=>'N','AAT'=>'N','AAA'=>'K','AAG'=>'K','AGC'=>'S',
			'AGT'=>'S','AGA'=>'R','AGG'=>'R','GTA'=>'V','GTC'=>'V',
			'GTG'=>'V','GTT'=>'V','GCA'=>'A','GCC'=>'A','GCG'=>'A',
			'GCT'=>'A','GAC'=>'D','GAT'=>'D','GAA'=>'E','GAG'=>'E',
			'GGA'=>'G','GGC'=>'G','GGG'=>'G','GGT'=>'G',
			'UCA'=>'S','UCC'=>'S','UCG'=>'S','UCU'=>'S','UUC'=>'F',
			'UUU'=>'F','UUA'=>'L','UUG'=>'L','UAC'=>'Y','UAU'=>'Y',
			'UAA'=>'_','UAG'=>'_','UGC'=>'C','UGU'=>'C','UGA'=>'_',
			'UGG'=>'W','CUA'=>'L','CUC'=>'L','CUG'=>'L','CUU'=>'L',
			'CCU'=>'P','CAU'=>'H','CGU'=>'R','AUA'=>'I','AUC'=>'I',
			'AUU'=>'I','AUG'=>'M','ACU'=>'T','AAU'=>'N','AGU'=>'S',
			'GUA'=>'V','GUC'=>'V','GUG'=>'V','GUU'=>'V','GCU'=>'A',
			'GAU'=>'D','GGU'=>'G'
);
our (%np) = (	'A'=>'T','T'=>'A',
		'G'=>'C','C'=>'G',
		'U'=>'A','N'=>'N',
		'R'=>'R','Y'=>'Y',
                'M'=>'M','K'=>'K',
                'S'=>'S','W'=>'W',
		'B'=>'B','D'=>'D',
                'H'=>'H','V'=>'V'
		);
our $baseUrl = 'http://www.ebi.ac.uk/Tools/services/rest/iprscan5';

###################################################################################################
############################### CONTROL FUNCTIONS #################################################
###################################################################################################

#CONTROL 1# - Verify sequence ans return type code - function(sequence)
#Return: 0 unknow/error input, 1 nucleotide sequence, 2 for protein
sub verify_sequence 
{
	#1 Input parameters
	my $Seq = shift;	# Input sequence
	my $delka = 0;		# Length of sequence
	my $count_nt = 0;	# Number of nucleotide SEQ
	my $count_pep = 0;	# Number of AK
	my $nucl;			# Temp nucleotide
	my $i = 0;			# Counter
	
	#2 Modify input
	$Seq =~ tr/\n//d;
	$Seq =~ tr/\r//d;
	
	#4 Procedure - run
	$delka = length($Seq);
	for($i=0;$i<$delka;$i++)
	{
		#A Get value
		$nucl = substr($Seq,$i,1);
		$nucl = uc $nucl;
		#B If value is nucleotide
		if ($nucl eq 'A' || $nucl eq 'T' || $nucl eq 'G' || $nucl eq 'C' || $nucl eq 'U' || $nucl eq 'N') 
		{
			$count_nt++;
		}
		#C If value is protein
		if ($nucl eq 'S' || $nucl eq 'F' || $nucl eq 'L' || $nucl eq 'Y' || $nucl eq 'S' ||
			$nucl eq 'W' || $nucl eq 'P' || $nucl eq 'H' || $nucl eq 'Q' || $nucl eq 'R' ||
			$nucl eq 'I' || $nucl eq 'M' || $nucl eq 'T' || $nucl eq 'N' || $nucl eq 'K' ||
			$nucl eq 'V' || $nucl eq 'D' || $nucl eq 'E' || $nucl eq 'A' || $nucl eq 'T' || 
			$nucl eq 'G' || $nucl eq 'C') 
		{
			$count_pep++;
		}
	}
	
	#5 Return value for NUCL or PEPTIDE
	return 1 if ($count_nt == $delka);
	return 2 if ($count_pep == $delka);
	return 0;
}

###################################################################################################
#################################### FUNCTIONS NUCLEOTIDE CONVERT TO PROTEIN ######################
###################################################################################################

#CONV 1# - Find best ORF for protein searching - function(sequence)
#Return: best_protein_sekvence
sub prot_best 
{
	#1 Set local variables	
	my $sekvence = shift;		# Input query
	my $best_sek = '';		# BEST SEQ
	my $a = 0;			# Counter for loop
	my $best_stop = 100000;		# Number of STOP codons
	my $codon = '';			# Codon for translate SEQ
	my $reverse = '';		# Reverse SEQ
	my %pole = ();			# Field for translate SEQ
	
	#2 Convert input
	$sekvence =~ s/\s//g;
	
	#3 Best SEQ in FORWARD mode
	for ($a=0;$a<3;$a++)
	{
		# Translate sequence
		%pole = &translate_seq($a,$sekvence);
		
		# Extract values
		if (($pole{'stop'} < $best_stop) || (length($pole{'data'}) > length($best_sek))) 
		{
			$best_stop = $pole{'stop'};
			$best_sek = $pole{'data'};
		}
	}
	
	#4 Translate into reverse mode
	for ($a=0;$a<length($sekvence);$a+=1)
	{
		#A Get codon
		$codon = substr($sekvence,$a,1);
		#B Iter reverse SEQ
		$reverse = &reverse_nucleotide($codon) . $reverse;
	}
	
	#5 Best SEQ in REVERSE mode
	for ($a=0;$a<3;$a++)
	{
		# Translate sequence
		%pole = &translate_seq($a,$reverse);
		
		# Extract values
		if (($pole{'stop'} < $best_stop) || (length($pole{'data'}) > length($best_sek))) 
		{
			$best_stop = $pole{'stop'};
			$best_sek = $pole{'data'};
		}
	}
	
	#6 Return value
	return $best_sek;
}

#CONV 2# - Translate sequence into protein - function($a, $seq)
#Return: translated sequence
sub translate_seq 
{
	#1 Define local variables
	my $a = shift;			# Reading frame
	my $seq = shift;		# Sequence
	my $i = 0;			# For loop counter
	my $aminoacid = '';		# Returned aminoacid
	my $temp_string = '';		# TEMP sequence
	my $codon = '';
	my %pole = ();			# Field with translated sequence and price
	
	#2 Inicialization of return field
	$pole{'stop'} = 0;		# Count of STOP codons
	$pole{'data'} = '';		# Result sequence
	
	#3 Translate tricodons
	for($i=$a; $i<(length($seq)-2); $i+=3)
	{
		#A Get substring for codon
		$codon = substr($seq,$i,3);
		#B Get translate codon
		$aminoacid = &codon2aa($codon);
		#C Is ERROR in AA
		if (defined($aminoacid))
		{
			# STOP codon
			if ($aminoacid eq 'Z')
			{
				# Increase counter of STOP
				$pole{'stop'} = $pole{'stop'} + 1;
				# If length is more
				if (length($temp_string) > length($pole{'data'}))
				{
					$pole{'data'} = $temp_string;
					$temp_string = '';	
				}
			}
			# OTHERS CODONS
			elsif (!($aminoacid eq '_'))
			{
				$temp_string = $temp_string . $aminoacid;
			}
		}
	}
	
	#4 If temp string is longer
	if (length($temp_string) > length($pole{'data'}))
	{
		$pole{'data'} = $temp_string;
		$temp_string = '';	
	}
	
	return %pole;
}

#CONV 3# - Translate tricodon to aminoacid - function(codon_string)
#Return: aminokyselina
sub codon2aa 
{
	#1 Input variable
	my $codon = shift;
	
	#2 Convert format
	$codon = uc $codon;

	#3A Get aminoacid
	if(exists $g{$codon})
	{
		return $g{$codon};
	}
	#3B If error give message
	elsif ($codon =~ /[RYKMSWBDHVN]/)
	{
		return "_";
	}
	else
	{
		print STDERR "Bad codon \"$codon\"!!\n";
		return undef;
	}
}

#CONV 4# - Get reverse sequence - function(sequence)
#Return: reverse text string nucleotide
sub reverse_nucleotide 
{
	#1 Define local variable
	my $nuc_in = shift;
	
	#2 Modify input
	$nuc_in = uc $nuc_in;
	
	#3 Give reverse nucleotide or error message
	if (exists $np{$nuc_in})
	{
		return $np{$nuc_in};
	}
	else 
	{
		print STDERR "Unknown nucleotide !!\n";
		return '';
	}
}

###################################################################################################
############################### FUNCTIONS IPR SCAN ################################################
###################################################################################################

#IPR 1# - Create user agent - function()
#Return: user agent
sub create_user_agent() 
{
	#1 Create LWP UserAgent
	my $ua = LWP::UserAgent->new();
	#2 Set HTTP header
	$ua->agent("EBI-Sample-Client/2791 (iprscan5_lwp.pl; $OSNAME) " . $ua->agent ());
	#3 Configure HTTP proxy
	$ua->env_proxy;
	#4 Return UA
	return $ua;
}

#IPR 2# - Function for client waiting - function(job_id)
#Return: job_status
sub client_wait 
{
	#1 Define local variables
	my $job_id = shift;
	my $job_status = 'PENDING';
	my $url = $baseUrl . '/status/' . $job_id;
	my $ERR_Count = 0;
	my $time_PC = 0;

	#2 Control for finish job
	while ($job_status eq 'RUNNING' || $job_status eq 'PENDING'
		|| ($job_status eq 'ERROR' && $ERR_Count < 5))
	{
		#A Get job status
		$job_status = &get_request($url);
		#B If ERROR
		if (!(defined($job_status))) 
		{
			return undef;
		}
		#C1 If ERROR and defined	
		if ($job_status eq 'ERROR') 
		{
			$ERR_Count++;
		}
		#C2 If NOT ERROR
		elsif ($ERR_Count > 0) 
		{
			$ERR_Count--;
		}
		#D Sleep if status
		if ($job_status eq 'RUNNING' || $job_status eq 'PENDING' || $job_status eq 'ERROR')
		{
			sleep 5;
			$time_PC++;
			# Condition for TIME ELAPSED
			if ($time_PC > 600)
			{
				return undef;
			}
		}
	}
	
	#3 If still ERROR get undef
	if ($job_status eq 'ERROR') 
	{
		return undef;
	}
	return $job_status;
}

#IPR 3# - Function for get request (checking status computing proces) - funkce($url)	{revised 15.11.2016}
#Return: request for url query
sub get_request 
{
	#1 Define local variables
	my $url	= shift;
	my $ua = undef;
	my $Acc_encode = '';
	my $response = undef;
	my $ret = undef;

	#2 Create user agent
	eval { $ua = &create_user_agent() unless defined($ua); };
	
	#3 Check Available HTTP compression methods
	eval { $Acc_encode = HTTP::Message::decodable(); };
	$Acc_encode = '' unless defined($Acc_encode);

	#4 Perform request
	$response = $ua->get($url,'Accept-Encoding' => $Acc_encode, );

	#5 Response control if succes	
	if (($response->is_success) && (defined($response))) 
	{
    		if (defined($Acc_encode) && $Acc_encode ne '') 
		{
			$ret = $response->decoded_content();
		}
		$ret = $response->content() unless defined($ret);
		return $ret;	
    	}
    	else 
	{
       		print STDERR color("red"), "IPRSCAN: ", $response->status_line, "\n", color("reset");
		&datab::err_message("IPRSCAN: " . $response->status_line . "\n");
		return undef;
    	}
}

#IPR 4# - Send SEQ as a query to IPR and GET RESULTS - function(sequence)
#Return: name of sequence with done IPR scan
sub send_job_IPR 
{
	#1 Local variables
	my %params = @_;				# Input field
	my %tool_params = ();			# Work field
	my $url = $baseUrl . '/run';	# Create URL adress
	my $ua;							# USER AGENT
	my $response;					# Get response from POST	
	my $job_id = undef;				# Job_id for search query
	my $result = undef;				# Result from IPR
	my $xp = undef;					# XML result format
	my @pole = ();					# Field for mining XML data
	my %entry_params = ();			# Field for entry params
	my @pole_nazvu_node = ('hmmer3-match','panther-match','blastprodom-match','fingerprints-match','hmmer2-match','profilescan-match',
				'patternscan-match','superfamilyhmmer3-match','signalp-match','tmhmm-match','phobius-match','coils-match');
	my @pole_uzlu = ();				# Extract field for nodes
	my $temp_node;					# Temp node for extraction
	my $typ_uzlu = '';				# Temp variable for node type
	my $i = 0;						# Increment variables for loop
	my @temp_field = ();			# Temp field for mining
	my $dat = undef;				# DB for INSERT data

	#2 Define work field
	$tool_params{'sequence'} = $params{'sequence'};
	$tool_params{'goterms'} = 'true';
	$tool_params{'pathways'} = 'true';
	$tool_params{'email'} = $params{'email'};
	
	#3 If mail not provided
	return 0 unless defined($tool_params{'email'});
	
	#4 Create user agent
	eval { $ua = &create_user_agent() unless defined($ua); };

	#5 Send query with POST
	$response = $ua->post($url, \%tool_params);	

	#6 Get job ID for identification
	$job_id = $response->content();
	select( undef, undef, undef, 0.5 );

	#7 CLIENT WAIT ON RESPONSE
	if(!(defined(&client_wait($job_id)))) 
	{
		print STDERR color("red"), "ERROR: Unsuccess search for sequence ", $params{'seq_id'}, "\n", color("reset");
        	&datab::err_message("ERROR: Unsuccess search for sequence " . $params{'seq_id'} . "\n");
		return 0;
	}

	#8 Client get response
	$url = $baseUrl . '/result/' . $job_id . '/xml';
	$result =  &get_request($url);

	#9 Result control
	if (!(defined($result))) 
	{
        	print STDERR color("red"), "ERROR: Unsuccess search for sequence ", $params{'seq_id'}, "\n", color("reset");
        	&datab::err_message("ERROR: Unsuccess search for sequence " . $params{'seq_id'} . "\n");
		return 0;
	}
	
	############## MINING PROCEDURE ###############
	#10 Get data from xml output file
	eval {
		$xp = XML::XPath->new(xml => $result);
	};
	
	#11 Get QUERY-NAME
	@pole = $xp->findnodes('/protein-matches/protein/xref');
	$entry_params{'Query_id'} = $pole[0]->getAttribute('id');	# NAZEV SEKVENCE - Query_id

	#12 Get access to DB
	$dat = undef;
	$dat = &datab::pristup($params{'db_host'}, $params{'db_name'}, $params{'db_user'}, $params{'db_heslo'});
	if (!(defined($dat))) 
	{
		return 0;
	}

	#13 Get rest informations from threads
	foreach $typ_uzlu (@pole_nazvu_node) 
	{
		#A Extract nodes selected type into field
		@pole_uzlu = ();
		@pole_uzlu = $xp->findnodes("/protein-matches/protein/matches/$typ_uzlu");
		$i = 0;
		#B Extract data from nodes selected type 
		foreach $temp_node (@pole_uzlu) 
		{
			#1 Increase counter
			$i++;
			#2 Get SCORE and E-VALUE
			$entry_params{'score'} = $temp_node->getAttribute('score');
			$entry_params{'evalue'} = $temp_node->getAttribute('evalue');
			#3 Integrity control SCORE and E-VALUE
			$entry_params{'score'} = undef unless defined($entry_params{'score'});
			$entry_params{'evalue'} = undef unless defined($entry_params{'evalue'});
			#4 Get LIBRARY-RELEASES - SOURCE LIBRARY
			@temp_field = $xp->findnodes("/protein-matches/protein/matches/$typ_uzlu" . "[$i]/signature/signature-library-release");
			$entry_params{'library'} = $temp_field[0]->getAttribute('library');
			$entry_params{'library'} = '' unless defined($entry_params{'library'});
			@temp_field = ();										
			#5 Get data from SIGNATURE
			@temp_field = $xp->findnodes("/protein-matches/protein/matches/$typ_uzlu" . "[$i]/signature");
			if (defined($temp_field[0])) 
			{
				$entry_params{'sig_name'} = $temp_field[0]->getAttribute('name');			# Name  - signature
				$entry_params{'sig_desc'} = $temp_field[0]->getAttribute('desc');			# Description - signature
				$entry_params{'sig_ac'} = $temp_field[0]->getAttribute('ac');				# Accession code - signature
			}
			else 
			{
				$entry_params{'sig_name'} = "" unless defined($entry_params{'sig_ac'});
				$entry_params{'sig_desc'} = "" unless defined($entry_params{'sig_desc'});
				$entry_params{'sig_ac'} = "" unless defined($entry_params{'sig_ac'});
			}
			#6 Extract from thread SIGNATURE/ENTRY
			@temp_field = ();
			@temp_field = $xp->findnodes("/protein-matches/protein/matches/$typ_uzlu" . "[$i]/signature/entry");
			if (defined($temp_field[0])) 
			{		
				$entry_params{'ent_type'} = $temp_field[0]->getAttribute('type');			# Ent_type
				$entry_params{'ent_name'} = $temp_field[0]->getAttribute('name');			# Ent_name
				$entry_params{'ent_desc'} = $temp_field[0]->getAttribute('desc');			# Ent_desc
				$entry_params{'ent_ac'} = $temp_field[0]->getAttribute('ac');				# Ent_ac => in some cases IPR_scan
			}
			else 
			{
				$entry_params{'ent_type'} = "" unless defined($entry_params{'ent_type'});
				$entry_params{'ent_name'} = "" unless defined($entry_params{'ent_name'});
				$entry_params{'ent_desc'} = "" unless defined($entry_params{'ent_desc'});
				$entry_params{'ent_ac'} = "" unless defined($entry_params{'ent_ac'});
			}
			#7 INSERT IPR RECORD INTO DB
			&datab::add_IPR_entry($dat, %entry_params);

			#8 Extract from subnode type of GO
			@temp_field = ();
			@temp_field = $xp->findnodes("/protein-matches/protein/matches/$typ_uzlu" . "[$i]/signature/entry/go-xref");
			foreach $temp_node (@temp_field) 
			{
				#A Get GO values
				$entry_params{'GO_id'} = $temp_node->getAttribute('id');				# GO_id in format GO:xxxxxxxxx
				$entry_params{'GO_db'} = $temp_node->getAttribute('db');				# GO_db = 'GO'
				#B INSERT GO NUMBERT INTO DB
				&datab::add_GO_struct($dat, %entry_params);
			}
		}
	}
	
	#14 Disconnect DB and return value
	$dat->disconnect;
	return $entry_params{'Query_id'};
}

1;
