Statistical Natural Language Processing with Perl
package nlp_worker;
use strict;
use warnings;
use Data::Dumper::Simple;
use Number::Format;
use Lingua::EN::Tagger;
use DBI;
#----------------------------- GLOBAL VARIABLES-------------------------------#
my($debug,$dumper) = (0,0);
my($pause,$tagger,$noun_significance,$noun_run_length,$text_length) = (1,0,0,0,0);
#-----------------------------------------------------------------------------#
sub process_nlp_data
{
my ($server,$counter,$dataid,$title,$description) = (0,0,'','','');
my ($cpu_stack) = @_;
my @row = ();
while(1)
{
my($dbh,$sth,$sql,$counter,$rc);
$dbh = DBI->connect( 'DBI:mysql:nlp:localhost','username','password',
{ PrintError => 0, RaiseError => 0 }
) or die "Can't connect to the database: $DBI::errstr\n";
#--- Retrieved data for processing. ---#
$sql = qq{
SELECT server, dataid, description
FROM msg_queue_$cpu_stack
ORDER BY dts ASC LIMIT 10;
};
if($dumper) { warn Dumper($sql) };
$sth = $dbh->prepare( $sql ) or die "Can't prepare SQL statement: $DBI::errstr\n";
$sth->execute() or die "Can't execute SQL statement: $DBI::errstr\n";
$sth->bind_col( 1, \$server );
$sth->bind_col( 2, \$dataid );
$sth->bind_col( 3, \$description );
$counter=0;
while ( $sth->fetch ) #--------------------------------------------------#
{
#--- warn Dumper($dataid, $description); ---#
&lingua_tagger_statistics($title,$description);
&insert_nlp_into_msg_queue($server,$dataid,$description);
$counter++;
} #----------------------------------------------------------------------#
#--- Delete Processed Data. ---#
$sql = qq{
DELETE FROM msg_queue_$cpu_stack
ORDER BY dts ASC LIMIT $counter;
};
if($dumper) { warn Dumper($sql) };
$rc = $dbh->do( $sql );
$sth = undef;
$rc = $dbh->disconnect();
if($counter < $pause) { print "Sleep $pause seconds\n"; sleep $pause; }
}
return "Total processed videos: $counter";
}
#-----------------------------------------------------------------------------#
sub insert_nlp_into_msg_queue
{
my($dbh, $sth, $sql);
my($server, $dataid, $description) = @_;
$dbh = DBI->connect( 'DBI:mysql:nlp:localhost','username','password',
{ PrintError => 0, RaiseError => 0 }
) or die "Can't connect to the database: $DBI::errstr\n";
$server = $dbh->quote($server);
$description = $dbh->quote($description);
$sql = qq{
INSERT DELAYED INTO msg_queue_nlp(server,dataid,noun_significance,noun_run_length,text_length,dts)
VALUES($server,'$dataid',$noun_significance,$noun_run_length,$text_length, now())
ON DUPLICATE KEY UPDATE i=i+1;
};
if($dumper) { warn Dumper($sql) }; warn Dumper($sql);
$dbh->do($sql);
$sth=undef;
$dbh->disconnect;
}
#------------------------------------------------------------------------------#
# lingua_tagger_statistics #
#------------------------------------------------------------------------------#
sub lingua_tagger_statistics
{
my($title, $description) = @_;
my($dbh,$sth,$sql, $counter);
my $usa = new Number::Format( -thousands_sep => ','
, -decimal_point => '.'
, -int_curr_symbol => 'USA');
$text_length = length($description);
print "\nDescription Length: " . $text_length . "\n";
&compute_noun_run_length($description);
&compute_noun_significance($description);
print $tagger ."|". $noun_significance ."|". $noun_run_length;
}
#------------------------------------------------------------------------------#
# Calculate noun run length
#------------------------------------------------------------------------------#
sub compute_noun_run_length
{
my($text,$p,$tagged_text, $term,$speech,$counter,$flag, $tag);
$noun_run_length = 0; my $readable_text = "";
my %hash = (); my %nlp_max = ();
$text = shift;
$p = new Lingua::EN::Tagger;
$tagged_text = $p->add_tags($text);
#-- print $tagged_text . "\n";
$readable_text = $p->get_readable( $text );
print "\n" . $readable_text . "\n\n" if $debug;
my @array = split(/ /, $readable_text);
$flag=0;
foreach(@array)
{
($term,$speech) = split(/\//,$_);
$speech = substr($speech, 0,2);
if($speech =~ /NN/)
{
$counter++;
if($counter > $noun_run_length) { $noun_run_length = $counter; }
print $speech . " " if $debug;
$flag=1;
}
else
{
print $speech . " " if $debug;
$counter=0;
$flag=0;
}
}
print "\n";
}
#------------------------------------------------------------------------------#
# Calculating spam approximity of text
#------------------------------------------------------------------------------#
sub compute_noun_significance
{
my($text) = @_;
my( $p, $tagged_text,$term, $pos, $usa);
my($percent,$total_count,$total_nouns,$readable_text)= (0,0,0,'');
my %word_list = ();
my %hash = ();
$p = new Lingua::EN::Tagger;
$tagged_text = $p->add_tags( $text );
#--- %word_list = $p->get_words( $text );
#--- foreach(%word_list) { print $_ . "\n"; }
$readable_text = $p->get_readable( $text );
#--- print "\n" . $readable_text . "\n";
my @array = split(/ /, $readable_text);
foreach(@array)
{
#-- print $_ . "\n";
($term,$pos) = split(/\//,$_);
#-- print $term . "\t" . $pos . "\n";
$hash{$pos} += 1;
}
#--- get the total count ---#
for my $key (keys(%hash))
{
$total_count += $hash{$key};
}
$tagger = $total_count;
print "\nTotal=" . $total_count . "\n" if $debug;
#--- print values ---#
for my $key (keys(%hash))
{
$percent = ($hash{$key} / $total_count) * 100;
if($key =~ /(NN)/) { $total_nouns += $percent; }
}
$usa = new Number::Format( -thousands_sep => ','
, -decimal_point => '.'
, -int_curr_symbol => 'USA');
$noun_significance = $usa->format_number($total_nouns);
print "Significance=" . $noun_significance . "%\n" if $debug;
}
#------------------------------------------------------------------------------#
1;
__END__
"
CC Conjunction, coordinating and, or
CD Adjective, cardinal number 3, fifteen
DET Determiner this, each, some
EX Pronoun, existential there there
FW Foreign words
IN Preposition / Conjunction for, of, although, that
JJ Adjective happy, bad
JJR Adjective, comparative happier, worse
JJS Adjective, superlative happiest, worst
LS Symbol, list item A, A.
MD Verb, modal can, could, 'll
NN Noun aircraft, data
NNP Noun, proper London, Michael
NNPS Noun, proper, plural Australians, Methodists
NNS Noun, plural women, books
PDT Determiner, prequalifier quite, all, half
POS Possessive 's, '
PRP Determiner, possessive second mine, yours
PRPS Determiner, possessive their, your
RB Adverb often, not, very, here
RBR Adverb, comparative faster
RBS Adverb, superlative fastest
RP Adverb, particle up, off, out
SYM Symbol *
TO Preposition to
UH Interjection oh, yes, mmm
VB Verb, infinitive take, live
VBD Verb, past tense took, lived
VBG Verb, gerund taking, living
VBN Verb, past/passive participle taken, lived
VBP Verb, base present form take, live
VBZ Verb, present 3SG -s form takes, lives
WDT Determiner, question which, whatever
WP Pronoun, question who, whoever
WPS Determiner, possessive & question whose
WRB Adverb, question when, how, however
PP Punctuation, sentence ender ., !, ?
PPC Punctuation, comma ,
PPD Punctuation, dollar sign $
PPL Punctuation, quotation mark left ``
PPR Punctuation, quotation mark right ''
PPS Punctuation, colon, semicolon, elipsis :, ..., -
LRB Punctuation, left bracket (, {, [
RRB Punctuation, right bracket ), }, ]
"
|