Home Index

1  Algorithms

2  Perl NLP

Statistical Natural Language Processing with Perl

package nlp_worker;
use strict;
use warnings;
use Data::Dumper::Simple;
use Number::Format;
use Lingua::EN::Tagger;
use DBI;
#----------------------------- GLOBAL VARIABLES-------------------------------#
my($debug,$dumper)  = (0,0);
my($pause,$tagger,$noun_significance,$noun_run_length,$text_length) = (1,0,0,0,0);
#-----------------------------------------------------------------------------#
sub process_nlp_data
{
  my ($server,$counter,$dataid,$title,$description) = (0,0,'','','');
  
  my ($cpu_stack) = @_;
  my @row        = ();

  while(1)
  {
    my($dbh,$sth,$sql,$counter,$rc);
    $dbh = DBI->connect( 'DBI:mysql:nlp:localhost','username','password',
                         { PrintError => 0, RaiseError => 0 }
                       ) or die "Can't connect to the database: $DBI::errstr\n";
    
    #--- Retrieved data for processing. ---#
    $sql = qq{  
               SELECT server, dataid, description
               FROM msg_queue_$cpu_stack
               ORDER BY dts ASC LIMIT 10;
             };

    if($dumper) { warn Dumper($sql) };
    
    $sth = $dbh->prepare( $sql ) or die "Can't prepare SQL statement: $DBI::errstr\n";
    $sth->execute() or die "Can't execute SQL statement: $DBI::errstr\n";
    
    $sth->bind_col( 1, \$server );
    $sth->bind_col( 2, \$dataid );
    $sth->bind_col( 3, \$description );
    $counter=0;
    
    while ( $sth->fetch ) #--------------------------------------------------#
    {       
       #--- warn Dumper($dataid, $description); ---#
       &lingua_tagger_statistics($title,$description);       
       &insert_nlp_into_msg_queue($server,$dataid,$description);
       $counter++;
    } #----------------------------------------------------------------------#
    
    
    #--- Delete Processed Data. ---#
    $sql = qq{  
               DELETE FROM msg_queue_$cpu_stack
               ORDER BY dts ASC LIMIT $counter;
             };

    if($dumper) { warn Dumper($sql) };
    
    $rc  = $dbh->do( $sql );
    $sth = undef;
    $rc  = $dbh->disconnect();

    if($counter < $pause) { print "Sleep $pause seconds\n"; sleep $pause; }
  }
  
  return "Total processed videos: $counter";
}
#-----------------------------------------------------------------------------#
sub insert_nlp_into_msg_queue
{
  my($dbh, $sth, $sql);
  my($server, $dataid, $description) = @_;
  
  $dbh = DBI->connect( 'DBI:mysql:nlp:localhost','username','password',
                       { PrintError => 0, RaiseError => 0 }
                     ) or die "Can't connect to the database: $DBI::errstr\n";

  $server      = $dbh->quote($server);
  $description = $dbh->quote($description);
  
  $sql = qq{
   INSERT DELAYED INTO msg_queue_nlp(server,dataid,noun_significance,noun_run_length,text_length,dts)
     VALUES($server,'$dataid',$noun_significance,$noun_run_length,$text_length, now())
   ON DUPLICATE KEY UPDATE i=i+1;
           };
        
  if($dumper) { warn Dumper($sql) }; warn Dumper($sql);
  $dbh->do($sql);
  $sth=undef;
  $dbh->disconnect;
}
#------------------------------------------------------------------------------#
#                      lingua_tagger_statistics                                #
#------------------------------------------------------------------------------#
sub lingua_tagger_statistics
{
  my($title, $description) = @_;
  my($dbh,$sth,$sql, $counter);
    
  my $usa = new Number::Format( -thousands_sep   => ','
                              , -decimal_point   => '.'
                              , -int_curr_symbol => 'USA');
  
  $text_length = length($description);
  print "\nDescription Length: " . $text_length . "\n";
  &compute_noun_run_length($description);
  &compute_noun_significance($description);
            
  print $tagger ."|". $noun_significance ."|". $noun_run_length;      

}
#------------------------------------------------------------------------------#
#                      Calculate noun run length
#------------------------------------------------------------------------------#
sub compute_noun_run_length 
{
  my($text,$p,$tagged_text, $term,$speech,$counter,$flag, $tag);
  $noun_run_length = 0;  my $readable_text = "";
  my %hash = ();  my %nlp_max = ();
  
  $text = shift;
  $p    = new Lingua::EN::Tagger;
  $tagged_text = $p->add_tags($text); 
  #-- print $tagged_text . "\n";

  $readable_text = $p->get_readable( $text );
  print "\n" . $readable_text . "\n\n" if $debug;
  
  my @array = split(/ /, $readable_text);
  $flag=0;
  
  foreach(@array)
  {
    ($term,$speech) = split(/\//,$_);
        
    $speech = substr($speech, 0,2);
    if($speech =~ /NN/)
    { 
      $counter++; 
      if($counter > $noun_run_length) { $noun_run_length = $counter; }
      
      print $speech . " " if $debug;
      $flag=1;
    }
    else
    {       
      print $speech . " " if $debug;
      $counter=0;
      $flag=0;
    }
  }
  
  print "\n";
}
#------------------------------------------------------------------------------#
#                  Calculating spam approximity of text
#------------------------------------------------------------------------------#
sub compute_noun_significance
{
  my($text) = @_;  
  my( $p, $tagged_text,$term, $pos, $usa);
  my($percent,$total_count,$total_nouns,$readable_text)= (0,0,0,'');
  
  my %word_list = (); 
  my %hash = (); 

  $p = new Lingua::EN::Tagger;
  $tagged_text = $p->add_tags( $text );

  #--- %word_list = $p->get_words( $text );
  #--- foreach(%word_list) { print $_ . "\n"; }

  $readable_text = $p->get_readable( $text );
  #--- print "\n" . $readable_text . "\n";

  my @array = split(/ /, $readable_text);

  foreach(@array)
  {
    #-- print $_ . "\n";
    ($term,$pos) = split(/\//,$_);
    
    #-- print $term . "\t" . $pos . "\n";
    $hash{$pos} += 1;
  }

  #--- get the total count ---#
  for my $key (keys(%hash)) 
  { 
    $total_count += $hash{$key};
  }
  $tagger = $total_count;
  print "\nTotal=" . $total_count . "\n" if $debug;
  
  #--- print values ---#
  for my $key (keys(%hash)) 
  { 
    $percent   = ($hash{$key} / $total_count) * 100;
    if($key    =~ /(NN)/) { $total_nouns += $percent; }
  }

  $usa = new Number::Format( -thousands_sep   => ','
                           , -decimal_point   => '.'
                           , -int_curr_symbol => 'USA');

  $noun_significance = $usa->format_number($total_nouns);
  print "Significance=" . $noun_significance . "%\n" if $debug;
}
#------------------------------------------------------------------------------#



1;

__END__

"
CC      Conjunction, coordinating               and, or
CD      Adjective, cardinal number              3, fifteen
DET     Determiner                              this, each, some
EX      Pronoun, existential there              there
FW      Foreign words
IN      Preposition / Conjunction               for, of, although, that
JJ      Adjective                               happy, bad
JJR     Adjective, comparative                  happier, worse
JJS     Adjective, superlative                  happiest, worst
LS      Symbol, list item                       A, A.
MD      Verb, modal                             can, could, 'll
NN      Noun                                    aircraft, data
NNP     Noun, proper                            London, Michael
NNPS    Noun, proper, plural                    Australians, Methodists
NNS     Noun, plural                            women, books
PDT     Determiner, prequalifier                quite, all, half
POS     Possessive                              's, '
PRP     Determiner, possessive second           mine, yours
PRPS    Determiner, possessive                  their, your
RB      Adverb                                  often, not, very, here
RBR     Adverb, comparative                     faster
RBS     Adverb, superlative                     fastest
RP      Adverb, particle                        up, off, out
SYM     Symbol                                  *
TO      Preposition                             to
UH      Interjection                            oh, yes, mmm
VB      Verb, infinitive                        take, live
VBD     Verb, past tense                        took, lived
VBG     Verb, gerund                            taking, living
VBN     Verb, past/passive participle           taken, lived
VBP     Verb, base present form                 take, live                                                 
VBZ     Verb, present 3SG -s form               takes, lives                                               
WDT     Determiner, question                    which, whatever                                            
WP      Pronoun, question                       who, whoever                                               
WPS     Determiner, possessive & question       whose                                                      
WRB     Adverb, question                        when, how, however                                         
                                                                                                           
PP      Punctuation, sentence ender             ., !, ?                                                    
PPC     Punctuation, comma                      ,                                                          
PPD     Punctuation, dollar sign                $                                                          
PPL     Punctuation, quotation mark left        ``                                                         
PPR     Punctuation, quotation mark right       ''                                                         
PPS     Punctuation, colon, semicolon, elipsis  :, ..., -                                                  
LRB     Punctuation, left bracket               (, {, [                                                    
RRB     Punctuation, right bracket              ), }, ] 
"


Copyright © 1997-2012 Kevin T. Duraj, All rights reserved
Agoura Hills, California -  Saturday, 19 May 2012