diff --git a/.gitignore b/.gitignore index dd59c1a..1ac12f6 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,5 @@ composer.phar composer.lock vagrant_ansible_inventory_default Vagrantfile -.DS_Store \ No newline at end of file +.DS_Store +/var/cache/rector \ No newline at end of file diff --git a/composer.json b/composer.json index 3fa31b9..40053a6 100644 --- a/composer.json +++ b/composer.json @@ -13,8 +13,8 @@ } ], "require": { - "php": ">=5.5.9" - }, + "php": ">=8.3" + }, "autoload": { "psr-4": { "Sentiment\\": "src/" @@ -24,5 +24,32 @@ "psr-4": { "Sentiment\\Tests\\": "tests/" } + }, + "require-dev": { + "rector/rector": "^2.1", + "phpstan/phpstan": "^2.1", + "pestphp/pest": "*", + "laravel/pint": "^1.25" + }, + "scripts": { + "rector": "vendor/bin/rector process", + "rector-dry": "vendor/bin/rector process --dry-run", + "phpstan": "vendor/bin/phpstan analyse", + "phpstan-baseline": "vendor/bin/phpstan analyse --generate-baseline", + "test": "vendor/bin/pest", + "test-coverage": "vendor/bin/pest --coverage", + "pint": "vendor/bin/pint", + "pint-test": "vendor/bin/pint --test", + "ready": [ + "@rector", + "@pint", + "@phpstan", + "@test" + ] + }, + "config": { + "allow-plugins": { + "pestphp/pest-plugin": true + } } -} \ No newline at end of file +} diff --git a/phpstan.neon b/phpstan.neon new file mode 100644 index 0000000..3f85672 --- /dev/null +++ b/phpstan.neon @@ -0,0 +1,6 @@ +parameters: + level: 5 + paths: + - src + tmpDir: var/cache/phpstan + reportUnmatchedIgnoredErrors: false \ No newline at end of file diff --git a/phpunit.xml b/phpunit.xml new file mode 100644 index 0000000..e6198e0 --- /dev/null +++ b/phpunit.xml @@ -0,0 +1,18 @@ + + + + + ./tests + + + + + app + src + + + diff --git a/pint.json b/pint.json new file mode 100644 index 0000000..156df32 --- /dev/null +++ b/pint.json @@ -0,0 +1,110 @@ +{ + "preset": "psr12", + "exclude": [ + "vendor", + "var" + ], + "rules": { + "array_syntax": { + "syntax": "short" + }, + "binary_operator_spaces": 
{ + "default": "single_space" + }, + "blank_line_after_namespace": true, + "blank_line_after_opening_tag": true, + "blank_line_before_statement": { + "statements": ["return"] + }, + "braces": { + "allow_single_line_closure": true, + "position_after_functions_and_oop_constructs": "next", + "position_after_control_structures": "same", + "position_after_anonymous_constructs": "same" + }, + "cast_spaces": { + "space": "single" + }, + "class_attributes_separation": { + "elements": { + "method": "one" + } + }, + "concat_space": { + "spacing": "none" + }, + "declare_equal_normalize": true, + "function_typehint_space": true, + "single_line_comment_style": { + "comment_types": ["hash"] + }, + "include": true, + "lowercase_cast": true, + "lowercase_static_reference": true, + "magic_constant_casing": true, + "method_argument_space": { + "on_multiline": "ensure_fully_multiline" + }, + "method_chaining_indentation": true, + "no_blank_lines_after_class_opening": true, + "no_blank_lines_after_phpdoc": true, + "no_empty_phpdoc": true, + "no_empty_statement": true, + "no_extra_blank_lines": { + "tokens": [ + "extra", + "throw", + "use" + ] + }, + "no_leading_import_slash": true, + "no_leading_namespace_whitespace": true, + "no_mixed_echo_print": true, + "no_multiline_whitespace_around_double_arrow": true, + "no_short_bool_cast": true, + "no_singleline_whitespace_before_semicolons": true, + "no_spaces_around_offset": true, + "no_trailing_comma_in_singleline": true, + "no_unneeded_control_parentheses": true, + "no_unneeded_curly_braces": true, + "no_unused_imports": true, + "no_whitespace_before_comma_in_array": true, + "no_whitespace_in_blank_line": true, + "normalize_index_brace": true, + "object_operator_without_whitespace": true, + "ordered_imports": { + "sort_algorithm": "alpha" + }, + "php_unit_fqcn_annotation": true, + "phpdoc_align": { + "align": "vertical" + }, + "phpdoc_annotation_without_dot": true, + "phpdoc_indent": true, + "phpdoc_inline_tag_normalizer": true, + 
"phpdoc_no_access": true, + "phpdoc_no_alias_tag": true, + "phpdoc_no_package": true, + "phpdoc_no_useless_inheritdoc": true, + "phpdoc_return_self_reference": true, + "phpdoc_scalar": true, + "phpdoc_single_line_var_spacing": true, + "phpdoc_trim": true, + "phpdoc_types": true, + "phpdoc_var_without_name": true, + "return_type_declaration": true, + "single_class_element_per_statement": true, + "single_import_per_statement": true, + "single_line_after_imports": true, + "single_quote": true, + "space_after_semicolon": { + "remove_in_empty_for_expressions": true + }, + "standardize_not_equals": true, + "ternary_operator_spaces": true, + "trailing_comma_in_multiline": true, + "trim_array_spaces": true, + "unary_operator_spaces": true, + "whitespace_after_comma_in_array": true + } +} \ No newline at end of file diff --git a/rector.php b/rector.php new file mode 100644 index 0000000..7f08411 --- /dev/null +++ b/rector.php @@ -0,0 +1,31 @@ +withPaths([ + __DIR__.'/src', + ]) + ->withPhpSets( + php83: true + ) + ->withSets([ + // Apply all PHP version upgrades up to PHP 8.3 + LevelSetList::UP_TO_PHP_83, + + // Code quality improvements + SetList::CODE_QUALITY, + SetList::DEAD_CODE, + SetList::EARLY_RETURN, + SetList::TYPE_DECLARATION, + + // Modern PHP practices + SetList::PRIVATIZATION, + SetList::NAMING, + ]) + ->withParallel() + ->withCache(__DIR__.'/var/cache/rector'); diff --git a/src/Analyzer.php b/src/Analyzer.php index 4088b7b..6ec941b 100644 --- a/src/Analyzer.php +++ b/src/Analyzer.php @@ -11,18 +11,26 @@ class Analyzer { - private $lexicon_file = ""; - private $lexicon = ""; + /** + * @var string + */ + public $emoji_lexicon; + /** + * @var mixed[] + */ + public $emojis; + private readonly string $lexicon_file; + private array $lexicon; - private $current_sentitext = null; + private ?\Sentiment\Procedures\SentiText $sentiText = null; - public function __construct($lexicon_file = 
"Lexicons/vader_sentiment_lexicon.txt",$emoji_lexicon='Lexicons/emoji_utf8_lexicon.txt') + public function __construct(string $lexicon_file = 'Lexicons/vader_sentiment_lexicon.txt', string $emoji_lexicon = 'Lexicons/emoji_utf8_lexicon.txt') { //Not sure about this as it forces lexicon file to be in the same directory as executing script - $this->lexicon_file = __DIR__ . DIRECTORY_SEPARATOR . $lexicon_file; + $this->lexicon_file = __DIR__.DIRECTORY_SEPARATOR.$lexicon_file; $this->lexicon = $this->make_lex_dict(); - $this->emoji_lexicon = __DIR__ . DIRECTORY_SEPARATOR .$emoji_lexicon; + $this->emoji_lexicon = __DIR__.DIRECTORY_SEPARATOR.$emoji_lexicon; $this->emojis = $this->make_emoji_dict(); } @@ -30,35 +38,32 @@ public function __construct($lexicon_file = "Lexicons/vader_sentiment_lexicon.tx /* Determine if input contains negation words */ - public function IsNegated($wordToTest, $include_nt = true) + public function IsNegated($wordToTest, $include_nt = true): bool { - $wordToTest = strtolower($wordToTest); + $wordToTest = strtolower((string) $wordToTest); if (in_array($wordToTest, Config::NEGATE)) { return true; } - if ($include_nt) { - if (strpos($wordToTest, "n't")) { - return true; - } - } - - return false; + return $include_nt && strpos($wordToTest, "n't"); } /* Convert lexicon file to a dictionary */ - public function make_lex_dict() + /** + * @return string[] + */ + public function make_lex_dict(): array { $lex_dict = []; - $fp = fopen($this->lexicon_file, "r"); + $fp = fopen($this->lexicon_file, 'r'); if (!$fp) { - die("Cannot load lexicon file"); + die('Cannot load lexicon file'); } while (($line = fgets($fp, 4096)) !== false) { - list($word, $measure) = explode("\t", trim($line)); + [$word, $measure] = explode("\t", trim($line)); //.strip().split('\t')[0:2] $lex_dict[$word] = $measure; //lex_dict[word] = float(measure) @@ -67,95 +72,97 @@ public function make_lex_dict() return $lex_dict; } - - public function make_emoji_dict() { + /** + * @return 
string[] + */ + public function make_emoji_dict(): array + { $emoji_dict = []; - $fp = fopen($this->emoji_lexicon, "r"); + $fp = fopen($this->emoji_lexicon, 'r'); if (!$fp) { - die("Cannot load emoji lexicon file"); + die('Cannot load emoji lexicon file'); } while (($line = fgets($fp, 4096)) !== false) { - list($emoji, $description) = explode("\t", trim($line)); + [$emoji, $description] = explode("\t", trim($line)); //.strip().split('\t')[0:2] $emoji_dict[$emoji] = $description; //lex_dict[word] = float(measure) } + return $emoji_dict; } - public function updateLexicon($arr) + public function updateLexicon($arr): ?array { - if(!is_array($arr)) return []; - $lexicon = []; + if (!is_array($arr)) { + return []; + } foreach ($arr as $word => $valence) { - $this->lexicon[strtolower($word)] = is_numeric($valence)? $valence : 0; + $this->lexicon[strtolower((string) $word)] = is_numeric($valence) ? $valence : 0; } - } - private function IsKindOf($firstWord, $secondWord) - { - return "kind" === strtolower($firstWord) && "of" === strtolower($secondWord); + return null; } - private function IsBoosterWord($word) + private function IsBoosterWord($word): bool { - return array_key_exists(strtolower($word), Config::BOOSTER_DICT); + return array_key_exists(strtolower((string) $word), Config::BOOSTER_DICT); } - private function getBoosterScaler($word) + private function getBoosterScaler($word): float { - return Config::BOOSTER_DICT[strtolower($word)]; + return Config::BOOSTER_DICT[strtolower((string) $word)]; } - private function IsInLexicon($word) + private function IsInLexicon($word): bool { - $lowercase = strtolower($word); + $lowercase = strtolower((string) $word); return array_key_exists($lowercase, $this->lexicon); } - private function IsUpperCaseWord($word) + private function IsUpperCaseWord($word): bool { - return ctype_upper($word); + return ctype_upper((string) $word); } private function getValenceFromLexicon($word) { - return $this->lexicon[strtolower($word)]; + return 
$this->lexicon[strtolower((string) $word)]; } - private function getTargetWordFromContext($wordInContext) + private function getTargetWordFromContext(array $wordInContext) { - return $wordInContext[count($wordInContext)-1]; + return $wordInContext[count($wordInContext) - 1]; } /* Gets the precedding two words to check for emphasis */ - private function getWordInContext($wordList, $currentWordPosition) + private function getWordInContext($wordList, int $currentWordPosition): array { - $precedingWordList =[]; + $precedingWordList = []; //push the actual word on to the context list array_unshift($precedingWordList, $wordList[$currentWordPosition]); //If the word position is greater than 2 then we know we are not going to overflow - if (($currentWordPosition-1)>=0) { - array_unshift($precedingWordList, $wordList[$currentWordPosition-1]); + if (($currentWordPosition - 1) >= 0) { + array_unshift($precedingWordList, $wordList[$currentWordPosition - 1]); } else { - array_unshift($precedingWordList, ""); + array_unshift($precedingWordList, ''); } - if (($currentWordPosition-2)>=0) { - array_unshift($precedingWordList, $wordList[$currentWordPosition-2]); + if (($currentWordPosition - 2) >= 0) { + array_unshift($precedingWordList, $wordList[$currentWordPosition - 2]); } else { - array_unshift($precedingWordList, ""); + array_unshift($precedingWordList, ''); } - if (($currentWordPosition-3)>=0) { - array_unshift($precedingWordList, $wordList[$currentWordPosition-3]); + if (($currentWordPosition - 3) >= 0) { + array_unshift($precedingWordList, $wordList[$currentWordPosition - 3]); } else { - array_unshift($precedingWordList, ""); + array_unshift($precedingWordList, ''); } return $precedingWordList; @@ -166,13 +173,12 @@ private function getWordInContext($wordList, $currentWordPosition) Positive values are positive valence, negative value are negative valence. 
*/ - public function getSentiment($text) + public function getSentiment($text): array { - $text_no_emoji = ''; $prev_space = true; - foreach($this->str_split_unicode($text) as $unichr ) { + foreach ($this->str_split_unicode($text) as $unichr) { if (array_key_exists($unichr, $this->emojis)) { $description = $this->emojis[$unichr]; if (!($prev_space)) { @@ -180,41 +186,36 @@ public function getSentiment($text) } $text_no_emoji .= $description; $prev_space = false; - } - else { + } else { $text_no_emoji .= $unichr; $prev_space = ($unichr == ' '); } } $text = trim($text_no_emoji); - $this->current_sentitext = new SentiText($text); + $this->sentiText = new SentiText($text); $sentiments = []; - $words_and_emoticons = $this->current_sentitext->words_and_emoticons; + $words_and_emoticons = $this->sentiText->words_and_emoticons; - for ($i=0; $i<=count($words_and_emoticons)-1; $i++) { + for ($i = 0; $i <= count($words_and_emoticons) - 1; $i++) { $valence = 0.0; $wordBeingTested = $words_and_emoticons[$i]; //If this is a booster word add a 0 valances then go to next word as it does not express sentiment directly - /* if ($this->IsBoosterWord($wordBeingTested)){ - echo "\t\tThe word is a booster word: setting sentiment to 0.0\n"; - }*/ - //var_dump($i); + /* if ($this->IsBoosterWord($wordBeingTested)){ + echo "\t\tThe word is a booster word: setting sentiment to 0.0\n"; + }*/ + //var_dump($i); //If the word is not in the Lexicon then it does not express sentiment. So just ignore it. 
- if ($this->IsInLexicon($wordBeingTested)) { - - //Special case because kind is in the lexicon so the modifier kind of needs to be skipped - if ("kind" !=$words_and_emoticons[$i] && "of" != $words_and_emoticons[$i]) { - $valence = $this->getValenceFromLexicon($wordBeingTested); - - $wordInContext = $this->getWordInContext($words_and_emoticons, $i); - //If we are here then we have a word that enhance booster words - $valence = $this->adjustBoosterSentiment($wordInContext, $valence); - } + //Special case because kind is in the lexicon so the modifier kind of needs to be skipped + if ($this->IsInLexicon($wordBeingTested) && ('kind' != $words_and_emoticons[$i] && 'of' != $words_and_emoticons[$i])) { + $valence = $this->getValenceFromLexicon($wordBeingTested); + $wordInContext = $this->getWordInContext($words_and_emoticons, $i); + //If we are here then we have a word that enhance booster words + $valence = $this->adjustBoosterSentiment($wordInContext, $valence); } - array_push($sentiments, $valence); + $sentiments[] = $valence; } //Once we have a sentiment for each word adjust the sentimest if but is present $sentiments = $this->_but_check($words_and_emoticons, $sentiments); @@ -222,23 +223,24 @@ public function getSentiment($text) return $this->score_valence($sentiments, $text); } - - private function str_split_unicode($str, $l = 0) { + private function str_split_unicode($str, $l = 0) + { if ($l > 0) { - $ret = array(); - $len = mb_strlen($str, "UTF-8"); + $ret = []; + $len = mb_strlen((string) $str, 'UTF-8'); for ($i = 0; $i < $len; $i += $l) { - $ret[] = mb_substr($str, $i, $l, "UTF-8"); + $ret[] = mb_substr((string) $str, $i, $l, 'UTF-8'); } + return $ret; } - return preg_split("//u", $str, -1, PREG_SPLIT_NO_EMPTY); - } + return preg_split('//u', (string) $str, -1, PREG_SPLIT_NO_EMPTY); + } private function applyValenceCapsBoost($targetWord, $valence) { - if ($this->IsUpperCaseWord($targetWord) && $this->current_sentitext->is_cap_diff) { + if 
($this->IsUpperCaseWord($targetWord) && $this->sentiText->is_cap_diff) { if ($valence > 0) { $valence += Config::C_INCR; } else { @@ -274,24 +276,24 @@ private function boosterScaleAdjustment($word, $valence) // dampen the scalar modifier of preceding words and emoticons // (excluding the ones that immediately preceed the item) based // on their distance from the current item. - private function dampendBoosterScalerByPosition($booster, $position) + private function dampendBoosterScalerByPosition($booster, int $position) { - if (0===$booster) { + if (0 === $booster) { return $booster; } - if (1==$position) { - return $booster*0.95; + if (1 == $position) { + return $booster * 0.95; } - if (2==$position) { - return $booster*0.9; + if (2 == $position) { + return $booster * 0.9; } return $booster; } - private function adjustBoosterSentiment($wordInContext, $valence) + private function adjustBoosterSentiment(array $wordInContext, $valence) { //The target word is always the last word $targetWord = $this->getTargetWordFromContext($wordInContext); @@ -299,21 +301,19 @@ private function adjustBoosterSentiment($wordInContext, $valence) //check if sentiment laden word is in ALL CAPS (while others aren't) and apply booster $valence = $this->applyValenceCapsBoost($targetWord, $valence); - $valence = $this->modifyValenceBasedOnContext($wordInContext, $valence); - - return $valence; + return $this->modifyValenceBasedOnContext($wordInContext, $valence); } - private function modifyValenceBasedOnContext($wordInContext, $valence) + private function modifyValenceBasedOnContext(array $wordInContext, $valence) { - $wordToTest = $this->getTargetWordFromContext($wordInContext); - //if($this->IsInLexicon($wordToTest)){ - // continue; - //} - for ($i=0; $igetTargetWordFromContext($wordInContext); + //if($this->IsInLexicon($wordToTest)){ + // continue; + //} + for ($i = 0; $i < count($wordInContext) - 1; $i++) { $scalarValue = $this->boosterScaleAdjustment($wordInContext[$i], $valence); 
$scalarValue = $this->dampendBoosterScalerByPosition($scalarValue, $i); - $valence = $valence+$scalarValue; + $valence += $scalarValue; } $valence = $this->_never_check($wordInContext, $valence); @@ -336,11 +336,9 @@ public function _least_check($wordInContext, $valence) { // check for negation case using "least" //if the previous word is least" - if (strtolower($wordInContext[2]) == "least") { - //but not "at least {word}" "very least {word}" - if (strtolower($wordInContext[1]) != "at" && strtolower($wordInContext[1]) != "very") { - $valence = $valence*Config::N_SCALAR; - } + //but not "at least {word}" "very least {word}" + if (strtolower((string) $wordInContext[2]) === 'least' && (strtolower((string) $wordInContext[1]) !== 'at' && strtolower((string) $wordInContext[1]) !== 'very')) { + $valence *= Config::N_SCALAR; } return $valence; @@ -349,16 +347,17 @@ public function _least_check($wordInContext, $valence) public function _but_check($words_and_emoticons, $sentiments) { // check for modification in sentiment due to contrastive conjunction 'but' - $bi = array_search("but", $words_and_emoticons); + $bi = array_search('but', $words_and_emoticons); if (!$bi) { - $bi = array_search("BUT", $words_and_emoticons); + $bi = array_search('BUT', $words_and_emoticons); } if ($bi) { - for ($si=0; $si$bi) { - $sentiments[$si] = $sentiments[$si]*1.5; + $counter = count($sentiments); + for ($si = 0; $si < $counter; $si++) { + if ($si < $bi) { + $sentiments[$si] *= 0.5; + } elseif ($si > $bi) { + $sentiments[$si] *= 1.5; } } } @@ -368,48 +367,44 @@ public function _but_check($words_and_emoticons, $sentiments) public function _idioms_check($wordInContext, $valence) { - $onezero = sprintf("%s %s", $wordInContext[2], $wordInContext[3]); - - $twoonezero = sprintf("%s %s %s", $wordInContext[1], $wordInContext[2], $wordInContext[3]); - - $twoone = sprintf("%s %s", $wordInContext[1], $wordInContext[2]); + $onezero = sprintf('%s %s', $wordInContext[2], $wordInContext[3]); - 
$threetwoone = sprintf("%s %s %s", $wordInContext[0], $wordInContext[1], $wordInContext[2]); + $twoonezero = sprintf('%s %s %s', $wordInContext[1], $wordInContext[2], $wordInContext[3]); - $threetwo = sprintf("%s %s", $wordInContext[0], $wordInContext[1]); + $twoone = sprintf('%s %s', $wordInContext[1], $wordInContext[2]); - $zeroone = sprintf("%s %s", $wordInContext[3], $wordInContext[2]); + $threetwoone = sprintf('%s %s %s', $wordInContext[0], $wordInContext[1], $wordInContext[2]); - $zeroonetwo = sprintf("%s %s %s", $wordInContext[3], $wordInContext[2], $wordInContext[1]); + $threetwo = sprintf('%s %s', $wordInContext[0], $wordInContext[1]); $sequences = [$onezero, $twoonezero, $twoone, $threetwoone, $threetwo]; - foreach ($sequences as $seq) { - $key = strtolower($seq); + foreach ($sequences as $sequence) { + $key = strtolower($sequence); if (array_key_exists($key, Config::SPECIAL_CASE_IDIOMS)) { $valence = Config::SPECIAL_CASE_IDIOMS[$key]; break; } -/* - Positive idioms check. Not implementing it yet - if(count($words_and_emoticons)-1 > $i){ - $zeroone = sprintf("%s %s",$words_and_emoticons[$i], $words_and_emoticons[$i+1]); - if (in_array($zeroone, Config::SPECIAL_CASE_IDIOMS)){ - $valence = Config::SPECIAL_CASE_IDIOMS[$zeroone]; - } - } - if(count($words_and_emoticons)-1 > $i+1){ - $zeroonetwo = sprintf("%s %s %s",$words_and_emoticons[$i], $words_and_emoticons[$i+1], $words_and_emoticons[$i+2]); - if (in_array($zeroonetwo, Config::SPECIAL_CASE_IDIOMS)){ - $valence = Config::SPECIAL_CASE_IDIOMS[$zeroonetwo]; - } - } -*/ + /* + Positive idioms check. 
Not implementing it yet + if(count($words_and_emoticons)-1 > $i){ + $zeroone = sprintf("%s %s",$words_and_emoticons[$i], $words_and_emoticons[$i+1]); + if (in_array($zeroone, Config::SPECIAL_CASE_IDIOMS)){ + $valence = Config::SPECIAL_CASE_IDIOMS[$zeroone]; + } + } + if(count($words_and_emoticons)-1 > $i+1){ + $zeroonetwo = sprintf("%s %s %s",$words_and_emoticons[$i], $words_and_emoticons[$i+1], $words_and_emoticons[$i+2]); + if (in_array($zeroonetwo, Config::SPECIAL_CASE_IDIOMS)){ + $valence = Config::SPECIAL_CASE_IDIOMS[$zeroonetwo]; + } + } + */ // check for booster/dampener bi-grams such as 'sort of' or 'kind of' if ($this->IsBoosterWord($threetwo) || $this->IsBoosterWord($twoone)) { - $valence = $valence+Config::B_DECR; + $valence += Config::B_DECR; } } @@ -420,12 +415,12 @@ public function _never_check($wordInContext, $valance) { //If the sentiment word is preceded by never so/this we apply a modifier $neverModifier = 0; - if ("never" == $wordInContext[0]) { + if ('never' == $wordInContext[0]) { $neverModifier = 1.25; - } else if ("never" == $wordInContext[1]) { + } elseif ('never' == $wordInContext[1]) { $neverModifier = 1.5; } - if ("so" == $wordInContext[1] || "so"== $wordInContext[2] || "this" == $wordInContext[1] || "this" == $wordInContext[2]) { + if ('so' == $wordInContext[1] || 'so' == $wordInContext[2] || 'this' == $wordInContext[1] || 'this' == $wordInContext[2]) { $valance *= $neverModifier; } @@ -439,58 +434,58 @@ public function _never_check($wordInContext, $valance) return $valance; } - public function _sentiment_laden_idioms_check($valence, $senti_text_lower){ - # Future Work - # check for sentiment laden idioms that don't contain a lexicon word + public function _sentiment_laden_idioms_check($valence, $senti_text_lower): float|int + { + // Future Work + // check for sentiment laden idioms that don't contain a lexicon word $idioms_valences = []; - foreach (Config::SENTIMENT_LADEN_IDIOMS as $idiom) { - if(in_array($idiom, $senti_text_lower)){ + 
foreach (Config::SENTIMENT_LADEN_IDIOMS as $idiom => $valence) { + if (in_array($idiom, $senti_text_lower)) { //print($idiom, $senti_text_lower) - $valence = Config::SENTIMENT_LADEN_IDIOMS[$idiom]; $idioms_valences[] = $valence; } } - if ((strlen($idioms_valences) > 0)) { - $valence = ( array_sum( explode( ',', $idioms_valences ) ) / floatval(strlen($idioms_valences))); + if (count($idioms_valences) > 0) { + return array_sum($idioms_valences) / floatval(count($idioms_valences)); } + return $valence; } - public function _punctuation_emphasis($sum_s, $text) + public function _punctuation_emphasis($sum_s, $text): float|int { // add emphasis from exclamation points and question marks $ep_amplifier = $this->_amplify_ep($text); $qm_amplifier = $this->_amplify_qm($text); - $punct_emph_amplifier = $ep_amplifier+$qm_amplifier; - return $punct_emph_amplifier; + return $ep_amplifier + $qm_amplifier; } - public function _amplify_ep($text) + public function _amplify_ep($text): float { // check for added emphasis resulting from exclamation points (up to 4 of them) - $ep_count = substr_count($text, "!"); + $ep_count = substr_count((string) $text, '!'); if ($ep_count > 4) { $ep_count = 4; } - # (empirically derived mean sentiment intensity rating increase for - # exclamation points) - $ep_amplifier = $ep_count*0.292; + // (empirically derived mean sentiment intensity rating increase for + // exclamation points) + $ep_amplifier = $ep_count * 0.292; return $ep_amplifier; } - public function _amplify_qm($text) + public function _amplify_qm($text): float|int { - # check for added emphasis resulting from question marks (2 or 3+) - $qm_count = substr_count($text, "?"); + // check for added emphasis resulting from question marks (2 or 3+) + $qm_count = substr_count((string) $text, '?'); $qm_amplifier = 0; if ($qm_count > 1) { if ($qm_count <= 3) { - # (empirically derived mean sentiment intensity rating increase for - # question marks) - $qm_amplifier = $qm_count*0.18; + // (empirically 
derived mean sentiment intensity rating increase for + // question marks) + $qm_amplifier = $qm_count * 0.18; } else { $qm_amplifier = 0.96; } @@ -499,20 +494,20 @@ public function _amplify_qm($text) return $qm_amplifier; } - public function _sift_sentiment_scores($sentiments) + public function _sift_sentiment_scores($sentiments): array { - # want separate positive versus negative sentiment scores + // want separate positive versus negative sentiment scores $pos_sum = 0.0; $neg_sum = 0.0; $neu_count = 0; - foreach ($sentiments as $sentiment_score) { - if ($sentiment_score > 0) { - $pos_sum += $sentiment_score +1; # compensates for neutral words that are counted as 1 + foreach ($sentiments as $sentiment) { + if ($sentiment > 0) { + $pos_sum += $sentiment + 1; // compensates for neutral words that are counted as 1 } - if ($sentiment_score < 0) { - $neg_sum += $sentiment_score -1; # when used with math.fabs(), compensates for neutrals + if ($sentiment < 0) { + $neg_sum += $sentiment - 1; // when used with math.fabs(), compensates for neutrals } - if ($sentiment_score == 0) { + if ($sentiment == 0) { $neu_count += 1; } } @@ -520,11 +515,11 @@ public function _sift_sentiment_scores($sentiments) return [$pos_sum, $neg_sum, $neu_count]; } - public function score_valence($sentiments, $text) + public function score_valence($sentiments, $text): array { if ($sentiments) { $sum_s = array_sum($sentiments); - # compute and add emphasis from punctuation in text + // compute and add emphasis from punctuation in text $punct_emph_amplifier = $this->_punctuation_emphasis($sum_s, $text); if ($sum_s > 0) { $sum_s += $punct_emph_amplifier; @@ -533,8 +528,8 @@ public function score_valence($sentiments, $text) } $compound = Config::normalize($sum_s); - # discriminate between positive, negative and neutral sentiment scores - list($pos_sum, $neg_sum, $neu_count) = $this->_sift_sentiment_scores($sentiments); + // discriminate between positive, negative and neutral sentiment scores + 
[$pos_sum, $neg_sum, $neu_count] = $this->_sift_sentiment_scores($sentiments); if ($pos_sum > abs($neg_sum)) { $pos_sum += $punct_emph_amplifier; @@ -543,7 +538,7 @@ public function score_valence($sentiments, $text) } $total = $pos_sum + abs($neg_sum) + $neu_count; - $pos =abs($pos_sum / $total); + $pos = abs($pos_sum / $total); $neg = abs($neg_sum / $total); $neu = abs($neu_count / $total); } else { @@ -553,12 +548,9 @@ public function score_valence($sentiments, $text) $neu = 0.0; } - $sentiment_dict = - ["neg" => round($neg, 3), - "neu" => round($neu, 3), - "pos" => round($pos, 3), - "compound" => round($compound, 4)]; - - return $sentiment_dict; + return ['neg' => round($neg, 3), + 'neu' => round($neu, 3), + 'pos' => round($pos, 3), + 'compound' => round($compound, 4)]; } } diff --git a/src/Config/Config.php b/src/Config/Config.php index 05f0304..afe529a 100644 --- a/src/Config/Config.php +++ b/src/Config/Config.php @@ -7,69 +7,65 @@ */ class Config { - - // (empirically derived mean sentiment intensity rating increase for booster words) - const B_INCR = 0.293; - const B_DECR = -0.293; + public const B_INCR = 0.293; + public const B_DECR = -0.293; // (empirically derived mean sentiment intensity rating increase for using // ALLCAPs to emphasize a word) - const C_INCR = 0.733; + public const C_INCR = 0.733; - const N_SCALAR = -0.74; + public const N_SCALAR = -0.74; // for removing punctuation //const REGEX_REMOVE_PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation)) - - const NEGATE = ["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt", + + public const NEGATE = ['aint', 'arent', 'cannot', 'cant', 'couldnt', 'darent', 'didnt', 'doesnt', "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't", - "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither", + 'dont', 'hadnt', 'hasnt', 'havent', 'isnt', 'mightnt', 'mustnt', 'neither', "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", 
"mustn't", - "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere", - "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent", - "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't", - "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"]; + 'neednt', "needn't", 'never', 'none', 'nope', 'nor', 'not', 'nothing', 'nowhere', + 'oughtnt', 'shant', 'shouldnt', 'uhuh', 'wasnt', 'werent', + "oughtn't", "shan't", "shouldn't", 'uh-uh', "wasn't", "weren't", + 'without', 'wont', 'wouldnt', "won't", "wouldn't", 'rarely', 'seldom', 'despite']; //booster/dampener 'intensifiers' or 'degree adverbs' //http://en.wiktionary.org/wiki/Category:English_degree_adverbs - const BOOSTER_DICT = ["absolutely"=> self::B_INCR, "amazingly"=> self::B_INCR, "awfully"=> self::B_INCR, "completely"=> self::B_INCR, "considerably"=> self::B_INCR, - "decidedly"=> self::B_INCR, "deeply"=> self::B_INCR, "effing"=> self::B_INCR,"enormous"=> self::B_INCR, "enormously"=> self::B_INCR, - "entirely"=> self::B_INCR, "especially"=> self::B_INCR, "exceptionally"=> self::B_INCR, "extremely"=> self::B_INCR, - "fabulously"=> self::B_INCR, "flipping"=> self::B_INCR, "flippin"=> self::B_INCR, - "fricking"=> self::B_INCR, "frickin"=> self::B_INCR, "frigging"=> self::B_INCR, "friggin"=> self::B_INCR, "fully"=> self::B_INCR, "fucking"=> self::B_INCR, - "greatly"=> self::B_INCR, "hella"=> self::B_INCR, "highly"=> self::B_INCR, "hugely"=> self::B_INCR, "incredibly"=> self::B_INCR, - "intensely"=> self::B_INCR, "majorly"=> self::B_INCR, "more"=> self::B_INCR, "most"=> self::B_INCR, "particularly"=> self::B_INCR, - "purely"=> self::B_INCR, "quite"=> self::B_INCR, "seemingly" => self::B_INCR, "really"=> self::B_INCR, "remarkably"=> self::B_INCR, - "so"=> self::B_INCR, "substantially"=> self::B_INCR, - "thoroughly"=> self::B_INCR, "totally"=> self::B_INCR, "tremendous"=> self::B_INCR, "tremendously"=> self::B_INCR, - "uber"=> self::B_INCR, "unbelievably"=> 
self::B_INCR, "unusually"=> self::B_INCR, "utterly"=> self::B_INCR, - "very"=> self::B_INCR, - "almost"=> self::B_DECR, "barely"=> self::B_DECR, "hardly"=> self::B_DECR, "just enough"=> self::B_DECR, - "kind of"=> self::B_DECR, "kinda"=> self::B_DECR, "kindof"=> self::B_DECR, "kind-of"=> self::B_DECR, - "less"=> self::B_DECR, "little"=> self::B_DECR, "marginally"=> self::B_DECR, "occasional"=> self::B_DECR, "occasionally"=> self::B_DECR, "partly"=> self::B_DECR, - "scarcely"=> self::B_DECR, "slightly"=> self::B_DECR, "somewhat"=> self::B_DECR, - "sort of"=> self::B_DECR, "sorta"=> self::B_DECR, "sortof"=> self::B_DECR, "sort-of"=> self::B_DECR]; - + public const BOOSTER_DICT = ['absolutely' => self::B_INCR, 'amazingly' => self::B_INCR, 'awfully' => self::B_INCR, 'completely' => self::B_INCR, 'considerably' => self::B_INCR, + 'decidedly' => self::B_INCR, 'deeply' => self::B_INCR, 'effing' => self::B_INCR, 'enormous' => self::B_INCR, 'enormously' => self::B_INCR, + 'entirely' => self::B_INCR, 'especially' => self::B_INCR, 'exceptionally' => self::B_INCR, 'extremely' => self::B_INCR, + 'fabulously' => self::B_INCR, 'flipping' => self::B_INCR, 'flippin' => self::B_INCR, + 'fricking' => self::B_INCR, 'frickin' => self::B_INCR, 'frigging' => self::B_INCR, 'friggin' => self::B_INCR, 'fully' => self::B_INCR, 'fucking' => self::B_INCR, + 'greatly' => self::B_INCR, 'hella' => self::B_INCR, 'highly' => self::B_INCR, 'hugely' => self::B_INCR, 'incredibly' => self::B_INCR, + 'intensely' => self::B_INCR, 'majorly' => self::B_INCR, 'more' => self::B_INCR, 'most' => self::B_INCR, 'particularly' => self::B_INCR, + 'purely' => self::B_INCR, 'quite' => self::B_INCR, 'seemingly' => self::B_INCR, 'really' => self::B_INCR, 'remarkably' => self::B_INCR, + 'so' => self::B_INCR, 'substantially' => self::B_INCR, + 'thoroughly' => self::B_INCR, 'totally' => self::B_INCR, 'tremendous' => self::B_INCR, 'tremendously' => self::B_INCR, + 'uber' => self::B_INCR, 'unbelievably' => self::B_INCR, 
'unusually' => self::B_INCR, 'utterly' => self::B_INCR, + 'very' => self::B_INCR, + 'almost' => self::B_DECR, 'barely' => self::B_DECR, 'hardly' => self::B_DECR, 'just enough' => self::B_DECR, + 'kind of' => self::B_DECR, 'kinda' => self::B_DECR, 'kindof' => self::B_DECR, 'kind-of' => self::B_DECR, + 'less' => self::B_DECR, 'little' => self::B_DECR, 'marginally' => self::B_DECR, 'occasional' => self::B_DECR, 'occasionally' => self::B_DECR, 'partly' => self::B_DECR, + 'scarcely' => self::B_DECR, 'slightly' => self::B_DECR, 'somewhat' => self::B_DECR, + 'sort of' => self::B_DECR, 'sorta' => self::B_DECR, 'sortof' => self::B_DECR, 'sort-of' => self::B_DECR]; - # check for sentiment laden idioms that do not contain lexicon words (future work, not yet implemented) - const SENTIMENT_LADEN_IDIOMS = ["cut the mustard"=> 2, "hand to mouth"=> -2, - "back handed"=> -2, "blow smoke"=> -2, "blowing smoke"=> -2, - "upper hand"=> 1, "break a leg"=> 2, - "cooking with gas"=> 2, "in the black"=> 2, "in the red"=> -2, - "on the ball"=> 2, "under the weather"=> -2]; + // check for sentiment laden idioms that do not contain lexicon words (future work, not yet implemented) + public const SENTIMENT_LADEN_IDIOMS = ['cut the mustard' => 2, 'hand to mouth' => -2, + 'back handed' => -2, 'blow smoke' => -2, 'blowing smoke' => -2, + 'upper hand' => 1, 'break a leg' => 2, + 'cooking with gas' => 2, 'in the black' => 2, 'in the red' => -2, + 'on the ball' => 2, 'under the weather' => -2]; // check for special case idioms using a sentiment-laden keyword known to SAGE - const SPECIAL_CASE_IDIOMS = ["the shit"=> 3, "the bomb"=> 3, "bad ass"=> 1.5, "bus stop"=> 0.0, "yeah right"=> -2, "cut the mustard"=> 2, "kiss of death"=> -1.5, "hand to mouth"=> -2, "beating heart"=> 3.1,"broken heart"=> -2.9, "to die for"=> 3]; - ##Static methods## + public const SPECIAL_CASE_IDIOMS = ['the shit' => 3, 'the bomb' => 3, 'bad ass' => 1.5, 'bus stop' => 0.0, 'yeah right' => -2, 'cut the mustard' => 2, 'kiss of 
death' => -1.5, 'hand to mouth' => -2, 'beating heart' => 3.1, 'broken heart' => -2.9, 'to die for' => 3]; + //#Static methods## /* Normalize the score to be between -1 and 1 using an alpha that approximates the max expected value */ - public static function normalize($score, $alpha = 15) + public static function normalize($score, $alpha = 15): float { - $norm_score = $score/sqrt(($score*$score) + $alpha); - return $norm_score; + return $score / sqrt(($score * $score) + $alpha); } } diff --git a/src/Procedures/SentiText.php b/src/Procedures/SentiText.php index 7a0773f..71bec6d 100644 --- a/src/Procedures/SentiText.php +++ b/src/Procedures/SentiText.php @@ -8,22 +8,14 @@ class SentiText { + public $words_and_emoticons; + public $is_cap_diff; - private $text = ""; - public $words_and_emoticons = null; - public $is_cap_diff = null; + public const PUNC_LIST = ['.', '!', '?', ',', ';', ':', '-', "'", '"', + '!!', '!!!', '??', '???', '?!?', '!?!', '?!?!', '!?!?']; - const PUNC_LIST = [".", "!", "?", ",", ";", ":", "-", "'", "\"", - "!!", "!!!", "??", "???", "?!?", "!?!", "?!?!", "!?!?"]; - - - function __construct($text) + public function __construct(private $text) { - //checking that is string - //if (!isinstance(text, str)){ - // text = str(text.encode('utf-8')); - //} - $this->text = $text; $this->words_and_emoticons = $this->_words_and_emoticons(); // doesn't separate words from\ // adjacent punctuation (keeps emoticons & contractions) @@ -33,18 +25,19 @@ function __construct($text) /* Remove all punctation from a string */ - function strip_punctuation($string) + public function strip_punctuation($string): string|array|null { //$string = strtolower($string); - return preg_replace("/[[:punct:]]+/", "", $string); + return preg_replace('/[[:punct:]]+/', '', (string) $string); } - function array_count_values_of($haystack, $needle) + public function array_count_values_of($haystack, $needle): int { if (!in_array($needle, $haystack, true)) { return 0; } $counts = 
array_count_values($haystack); + return $counts[$needle]; } @@ -56,67 +49,62 @@ function array_count_values_of($haystack, $needle) */ private function allcap_differential($words) { - $is_different = false; $allcap_words = 0; foreach ($words as $word) { //ctype is affected by the local of the processor see manual for more details - if (ctype_upper($word)) { + if (ctype_upper((string) $word)) { $allcap_words += 1; } } $cap_differential = count($words) - $allcap_words; if ($cap_differential > 0 && $cap_differential < count($words)) { - $is_different = true; + return true; } + - return $is_different; + return false; } - function _words_only() + public function _words_only() { $text_mod = $this->strip_punctuation($this->text); // removes punctuation (but loses emoticons & contractions) - $words_only = preg_split('/\s+/', $text_mod); - # get rid of empty items or single letter "words" like 'a' and 'I' - $works_only = array_filter($words_only, function ($word) { - return strlen($word) > 1; - }); + $words_only = preg_split('/\s+/', (string) $text_mod); + // get rid of empty items or single letter "words" like 'a' and 'I' + $words_only = array_values(array_filter($words_only, fn ($word): bool => strlen($word) > 1)); + return $words_only; } - function _words_and_emoticons() + public function _words_and_emoticons() { + $wes = preg_split('/\s+/', (string) $this->text); - $wes = preg_split('/\s+/', $this->text); - - # get rid of residual empty items or single letter words - $wes = array_filter($wes, function ($word) { - return strlen($word) > 1; - }); + // get rid of residual empty items or single letter words + $wes = array_filter($wes, fn ($word): bool => strlen($word) > 1); //Need to remap the indexes of the array $wes = array_values($wes); $words_only = $this->_words_only(); - foreach ($words_only as $word) { + foreach ($words_only as $word_only) { foreach (self::PUNC_LIST as $punct) { //replace all punct + word combinations with word - $pword = $punct .$word; - + $pword = $punct.$word_only; $x1 =
$this->array_count_values_of($wes, $pword); while ($x1 > 0) { $i = array_search($pword, $wes, true); unset($wes[$i]); - array_splice($wes, $i, 0, $word); + array_splice($wes, $i, 0, $word_only); $x1 = $this->array_count_values_of($wes, $pword); } //Do the same as above but word then punct - $wordp = $word . $punct; + $wordp = $word_only.$punct; $x2 = $this->array_count_values_of($wes, $wordp); while ($x2 > 0) { $i = array_search($wordp, $wes, true); unset($wes[$i]); - array_splice($wes, $i, 0, $word); + array_splice($wes, $i, 0, $word_only); $x2 = $this->array_count_values_of($wes, $wordp); } } diff --git a/tests/Pest.php b/tests/Pest.php new file mode 100644 index 0000000..89287f1 --- /dev/null +++ b/tests/Pest.php @@ -0,0 +1,45 @@ +extend(Sentiment\Tests\TestCase::class)->in('Feature'); + +/* +|-------------------------------------------------------------------------- +| Expectations +|-------------------------------------------------------------------------- +| +| When you're writing tests, you often need to check that values meet certain conditions. The +| "expect()" function gives you access to a set of "expectations" methods that you can use +| to assert different things. Of course, you may extend the Expectation API at any time. +| +*/ + +expect()->extend('toBeOne', function () { + return $this->toBe(1); +}); + +/* +|-------------------------------------------------------------------------- +| Functions +|-------------------------------------------------------------------------- +| +| While Pest is very powerful out-of-the-box, you may have some testing code specific to your +| project that you don't want to repeat in every file. Here you can also expose helpers as +| global functions to help you to reduce the number of lines of code in your test files. +| +*/ + +function something() +{ + // .. 
+} diff --git a/tests/TestCase.php b/tests/TestCase.php new file mode 100644 index 0000000..5e9e312 --- /dev/null +++ b/tests/TestCase.php @@ -0,0 +1,10 @@ +analyzer = new Analyzer(); +}); + +describe('Analyzer', function () { + it('can be instantiated', function () { + expect($this->analyzer)->toBeInstanceOf(Analyzer::class); + }); + + it('analyzes positive sentences correctly', function () { + // Create a fresh analyzer instance for this test + $analyzer = new Analyzer(); + + // Test one simple positive sentence + $result = $analyzer->getSentiment('This is great'); + + // Ensure all keys are present + expect($result)->toHaveKeys(['neg', 'neu', 'pos', 'compound']); + + // Positive score should be greater than negative + expect($result['pos'])->toBeGreaterThan($result['neg']); + + // Compound should be positive (> 0 for clearly positive) + expect($result['compound'])->toBeGreaterThan(0); + }); + + it('analyzes negative sentences correctly', function () { + $negatives = [ + 'This is terrible!' 
=> ['neg' => true, 'compound' => true], + 'I hate this' => ['neg' => true, 'compound' => true], + 'Worst experience ever' => ['neg' => true, 'compound' => true], + 'This is awful' => ['neg' => true, 'compound' => true], + 'Completely disappointed' => ['neg' => true, 'compound' => true], + 'Horrible service' => ['neg' => true, 'compound' => true], + ]; + + foreach ($negatives as $text => $expectations) { + $result = $this->analyzer->getSentiment($text); + + // Negative score should be greater than positive + expect($result['neg'])->toBeGreaterThan($result['pos']); + + // Compound should be negative (< -0.05 for clearly negative) + expect($result['compound'])->toBeLessThan(-0.05); + } + }); + + it('analyzes neutral sentences correctly', function () { + $neutrals = [ + 'The sky is blue', + 'Today is Monday', + 'The book is on the table', + 'Water is H2O', + 'The meeting is at 3pm', + ]; + + foreach ($neutrals as $text) { + $result = $this->analyzer->getSentiment($text); + + // Neutral score should be dominant + expect($result['neu'])->toBeGreaterThan(0.5); + + // Compound should be close to 0 (between -0.05 and 0.05) + expect($result['compound'])->toBeBetween(-0.05, 0.05); + } + }); + + it('handles emojis in sentiment analysis', function () { + $textsWithEmojis = [ + 'I love this 😍' => ['positive' => true], + 'So sad 😢' => ['negative' => true], + 'Happy day 😊' => ['positive' => true], + 'Angry 😠' => ['negative' => true], + ]; + + foreach ($textsWithEmojis as $text => $expectation) { + $result = $this->analyzer->getSentiment($text); + + if ($expectation['positive'] ?? false) { + expect($result['compound'])->toBeGreaterThan(0); + } + if ($expectation['negative'] ?? 
false) { + expect($result['compound'])->toBeLessThan(0); + } + } + }); + + it('handles negation correctly', function () { + // Create a fresh analyzer instance + $analyzer = new Analyzer(); + + // Test basic negation + $result = $analyzer->getSentiment('not good'); + + // "not good" should be negative + expect($result['compound'])->toBeLessThanOrEqual(0); + }); + + it('handles emphasis with punctuation', function () { + // Multiple exclamation marks should amplify sentiment + $regular = $this->analyzer->getSentiment('This is good'); + $emphasized = $this->analyzer->getSentiment('This is good!!!'); + + // Emphasized should have stronger positive sentiment + expect(abs($emphasized['compound']))->toBeGreaterThan(abs($regular['compound'])); + + // Question marks can also affect sentiment + $question = $this->analyzer->getSentiment('This is good???'); + expect($question)->toHaveKeys(['neg', 'neu', 'pos', 'compound']); + }); + + it('handles all caps for emphasis', function () { + $regular = $this->analyzer->getSentiment('this is amazing'); + $allCaps = $this->analyzer->getSentiment('THIS IS AMAZING'); + + // All caps should amplify the sentiment + expect(abs($allCaps['compound']))->toBeGreaterThanOrEqual(abs($regular['compound'])); + }); + + it('handles BUT conjunction correctly', function () { + // Sentiment after BUT should be weighted more heavily + $result = $this->analyzer->getSentiment('The food was great but the service was terrible'); + + // Should lean negative because negative part comes after BUT + expect($result['compound'])->toBeLessThan(0); + + // Reverse case + $result2 = $this->analyzer->getSentiment('The service was terrible but the food was great'); + + // Should lean positive because positive part comes after BUT + expect($result2['compound'])->toBeGreaterThan(0); + }); + + it('returns consistent score structure', function () { + $result = $this->analyzer->getSentiment('Test sentence'); + + // Check all required keys exist + 
expect($result)->toHaveKeys(['neg', 'neu', 'pos', 'compound']); + + // Check all values are numeric + expect($result['neg'])->toBeNumeric(); + expect($result['neu'])->toBeNumeric(); + expect($result['pos'])->toBeNumeric(); + expect($result['compound'])->toBeNumeric(); + + // Check scores are normalized (sum to approximately 1) + $sum = $result['neg'] + $result['neu'] + $result['pos']; + expect($sum)->toBeBetween(0.999, 1.001); + + // Check compound is between -1 and 1 + expect($result['compound'])->toBeBetween(-1, 1); + }); + + it('handles empty and whitespace strings', function () { + $emptyResult = $this->analyzer->getSentiment(''); + expect($emptyResult['compound'])->toBe(0.0); + expect($emptyResult['neg'])->toBe(0.0); + expect($emptyResult['pos'])->toBe(0.0); + expect($emptyResult['neu'])->toBe(0.0); + + $whitespaceResult = $this->analyzer->getSentiment(' '); + expect($whitespaceResult['compound'])->toBe(0.0); + }); + + it('can update lexicon with custom words', function () { + // Add custom positive word + $this->analyzer->updateLexicon(['awesomesauce' => 3.0]); + + $result = $this->analyzer->getSentiment('This is awesomesauce'); + expect($result['compound'])->toBeGreaterThan(0); + + // Add custom negative word + $this->analyzer->updateLexicon(['terribleawful' => -3.0]); + + $result2 = $this->analyzer->getSentiment('This is terribleawful'); + expect($result2['compound'])->toBeLessThan(0); + }); + + it('detects negation with IsNegated method', function () { + expect($this->analyzer->IsNegated('not'))->toBeTrue(); + expect($this->analyzer->IsNegated('never'))->toBeTrue(); + expect($this->analyzer->IsNegated("isn't"))->toBeTrue(); + expect($this->analyzer->IsNegated("wouldn't"))->toBeTrue(); + expect($this->analyzer->IsNegated('happy'))->toBeFalse(); + // 'no' is not in the NEGATE array, so removed that test + }); +}); diff --git a/tests/Unit/ConfigTest.php b/tests/Unit/ConfigTest.php new file mode 100644 index 0000000..321252a --- /dev/null +++ 
b/tests/Unit/ConfigTest.php @@ -0,0 +1,138 @@ +toBeArray(); + expect(Config::BOOSTER_DICT)->toBeArray(); + expect(Config::SPECIAL_CASE_IDIOMS)->toBeArray(); + expect(Config::SENTIMENT_LADEN_IDIOMS)->toBeArray(); + }); + + it('has correct incremental values', function () { + expect(Config::B_INCR)->toBeNumeric(); + expect(Config::B_DECR)->toBeNumeric(); + expect(Config::C_INCR)->toBeNumeric(); + expect(Config::N_SCALAR)->toBeNumeric(); + + // B_INCR should be positive + expect(Config::B_INCR)->toBeGreaterThan(0); + // B_DECR should be negative + expect(Config::B_DECR)->toBeLessThan(0); + }); + + it('contains expected negation words', function () { + $expectedNegations = ['not', 'never', 'neither', 'nowhere', 'nothing', 'none', 'without']; + + foreach ($expectedNegations as $word) { + expect(Config::NEGATE)->toContain($word); + } + }); + + it('has booster words with correct values', function () { + // Check some known booster words + expect(Config::BOOSTER_DICT)->toHaveKey('absolutely'); + expect(Config::BOOSTER_DICT['absolutely'])->toBeNumeric(); + + expect(Config::BOOSTER_DICT)->toHaveKey('very'); + expect(Config::BOOSTER_DICT['very'])->toBeNumeric(); + + expect(Config::BOOSTER_DICT)->toHaveKey('slightly'); + expect(Config::BOOSTER_DICT['slightly'])->toBeNumeric(); + + // Intensifiers should have positive values + expect(Config::BOOSTER_DICT['absolutely'])->toBeGreaterThan(0); + expect(Config::BOOSTER_DICT['very'])->toBeGreaterThan(0); + + // Diminishers should have negative values + expect(Config::BOOSTER_DICT['slightly'])->toBeLessThan(0); + }); + + it('has special case idioms with sentiment values', function () { + // Check some known idioms + expect(Config::SPECIAL_CASE_IDIOMS)->toHaveKey('the shit'); + expect(Config::SPECIAL_CASE_IDIOMS)->toHaveKey('the bomb'); + expect(Config::SPECIAL_CASE_IDIOMS)->toHaveKey('bad ass'); + + // These should have numeric sentiment values + foreach (Config::SPECIAL_CASE_IDIOMS as $idiom => $value) { + 
expect($value)->toBeNumeric(); + } + }); + + it('has sentiment laden idioms with values', function () { + // Check structure + expect(Config::SENTIMENT_LADEN_IDIOMS)->toBeArray(); + + // Check some known idioms + expect(Config::SENTIMENT_LADEN_IDIOMS)->toHaveKey('cut the mustard'); + expect(Config::SENTIMENT_LADEN_IDIOMS)->toHaveKey('on the ball'); + + // All values should be numeric + foreach (Config::SENTIMENT_LADEN_IDIOMS as $idiom => $value) { + expect($value)->toBeNumeric(); + } + }); + + it('normalizes scores correctly', function () { + // Test with different scores + $testCases = [ + ['score' => 0, 'expected' => 0.0], + ['score' => 5, 'alpha' => 15, 'min' => 0.7, 'max' => 0.8], + ['score' => -5, 'alpha' => 15, 'min' => -0.8, 'max' => -0.7], + ['score' => 15, 'alpha' => 15, 'min' => 0.96, 'max' => 0.98], + ['score' => -15, 'alpha' => 15, 'min' => -0.98, 'max' => -0.96], + ]; + + foreach ($testCases as $test) { + $score = $test['score']; + $alpha = $test['alpha'] ?? 15; + $result = Config::normalize($score, $alpha); + + // Result should be between -1 and 1 + expect($result)->toBeBetween(-1, 1); + + // Check expected value or range + if (isset($test['expected'])) { + expect($result)->toBe($test['expected']); + } elseif (isset($test['min']) && isset($test['max'])) { + expect($result)->toBeBetween($test['min'], $test['max']); + } + } + }); + + it('normalize function handles edge cases', function () { + // Very large positive score should approach 1 + $largePositive = Config::normalize(1000, 15); + expect($largePositive)->toBeLessThan(1); + expect($largePositive)->toBeGreaterThan(0.95); + + // Very large negative score should approach -1 + $largeNegative = Config::normalize(-1000, 15); + expect($largeNegative)->toBeGreaterThan(-1); + expect($largeNegative)->toBeLessThan(-0.95); + + // Zero should return zero + expect(Config::normalize(0))->toBe(0.0); + }); + + it('normalize function with different alpha values', function () { + $score = 10; + + // Smaller alpha 
makes normalization more aggressive + $smallAlpha = Config::normalize($score, 5); + $normalAlpha = Config::normalize($score, 15); + $largeAlpha = Config::normalize($score, 50); + + // With same score, smaller alpha should give larger normalized value + expect($smallAlpha)->toBeGreaterThan($normalAlpha); + expect($normalAlpha)->toBeGreaterThan($largeAlpha); + + // All should still be between -1 and 1 + expect($smallAlpha)->toBeBetween(0, 1); + expect($normalAlpha)->toBeBetween(0, 1); + expect($largeAlpha)->toBeBetween(0, 1); + }); +}); diff --git a/tests/Unit/SentiTextTest.php b/tests/Unit/SentiTextTest.php new file mode 100644 index 0000000..5591b19 --- /dev/null +++ b/tests/Unit/SentiTextTest.php @@ -0,0 +1,159 @@ +toBeInstanceOf(SentiText::class); + }); + + it('extracts words and emoticons correctly', function () { + $sentiText = new SentiText('Hello world :) How are you?'); + expect($sentiText->words_and_emoticons)->toBeArray(); + expect($sentiText->words_and_emoticons)->toContain('Hello'); + expect($sentiText->words_and_emoticons)->toContain('world'); + expect($sentiText->words_and_emoticons)->toContain(':)'); + }); + + it('strips punctuation correctly', function () { + $sentiText = new SentiText('test'); + + $result = $sentiText->strip_punctuation('Hello, world! How are you?'); + expect($result)->toBe('Hello world How are you'); + + $result = $sentiText->strip_punctuation('Test... with... 
dots...'); + expect($result)->toBe('Test with dots'); + + $result = $sentiText->strip_punctuation('No punctuation here'); + expect($result)->toBe('No punctuation here'); + }); + + it('counts array values correctly', function () { + $sentiText = new SentiText('test'); + + $haystack = ['apple', 'banana', 'apple', 'cherry', 'apple']; + + expect($sentiText->array_count_values_of($haystack, 'apple'))->toBe(3); + expect($sentiText->array_count_values_of($haystack, 'banana'))->toBe(1); + expect($sentiText->array_count_values_of($haystack, 'cherry'))->toBe(1); + expect($sentiText->array_count_values_of($haystack, 'orange'))->toBe(0); + }); + + it('detects capitalization differential', function () { + // Test through the public property instead of private method + + // All lowercase - no differential + $sentiText = new SentiText('hello world test'); + expect($sentiText->is_cap_diff)->toBeFalse(); + + // All uppercase - no differential + $sentiText = new SentiText('HELLO WORLD TEST'); + expect($sentiText->is_cap_diff)->toBeFalse(); + + // Mixed case - has differential + $sentiText = new SentiText('HELLO world TEST'); + expect($sentiText->is_cap_diff)->toBeTrue(); + + // One uppercase among lowercase - has differential + $sentiText = new SentiText('hello WORLD test'); + expect($sentiText->is_cap_diff)->toBeTrue(); + }); + + it('sets is_cap_diff property correctly', function () { + // All lowercase + $sentiText = new SentiText('hello world test'); + expect($sentiText->is_cap_diff)->toBeFalse(); + + // All uppercase + $sentiText = new SentiText('HELLO WORLD TEST'); + expect($sentiText->is_cap_diff)->toBeFalse(); + + // Mixed case + $sentiText = new SentiText('HELLO world TEST'); + expect($sentiText->is_cap_diff)->toBeTrue(); + }); + + it('handles punctuation list correctly', function () { + expect(SentiText::PUNC_LIST)->toBeArray(); + expect(SentiText::PUNC_LIST)->toContain('.'); + expect(SentiText::PUNC_LIST)->toContain('!'); + expect(SentiText::PUNC_LIST)->toContain('?'); + 
expect(SentiText::PUNC_LIST)->toContain('!!!'); + expect(SentiText::PUNC_LIST)->toContain('???'); + }); + + it('preserves emoticons when extracting words', function () { + $emoticons = [':)', ':(', ':D', ';)', ':/', ':P']; + $text = 'Hello :) this is good'; + + $sentiText = new SentiText($text); + + // Check that emoticons are preserved (though they might be part of words) + expect($sentiText->words_and_emoticons)->toContain(':)'); + }); + + it('handles contractions correctly', function () { + $contractions = [ + "don't" => "don't", + "won't" => "won't", + "can't" => "can't", + "wouldn't" => "wouldn't", + ]; + + foreach ($contractions as $input => $expected) { + $sentiText = new SentiText($input); + expect($sentiText->words_and_emoticons)->toContain($expected); + } + }); + + it('filters out single letter words', function () { + $text = 'I a test of single letters x y z'; + $sentiText = new SentiText($text); + + // Single letters should be filtered out + expect($sentiText->words_and_emoticons)->not->toContain('I'); + expect($sentiText->words_and_emoticons)->not->toContain('a'); + expect($sentiText->words_and_emoticons)->not->toContain('x'); + expect($sentiText->words_and_emoticons)->not->toContain('y'); + expect($sentiText->words_and_emoticons)->not->toContain('z'); + + // Multi-letter words should remain + expect($sentiText->words_and_emoticons)->toContain('test'); + expect($sentiText->words_and_emoticons)->toContain('of'); + expect($sentiText->words_and_emoticons)->toContain('single'); + expect($sentiText->words_and_emoticons)->toContain('letters'); + }); + + it('handles empty and whitespace text', function () { + $emptyText = new SentiText(''); + expect($emptyText->words_and_emoticons)->toBeArray(); + expect($emptyText->words_and_emoticons)->toBeEmpty(); + + $whitespaceText = new SentiText(' '); + expect($whitespaceText->words_and_emoticons)->toBeArray(); + expect($whitespaceText->words_and_emoticons)->toBeEmpty(); + }); + + it('handles text with multiple 
punctuation marks', function () { + $text = 'Wow!!! Really??? That is good'; + $sentiText = new SentiText($text); + + expect($sentiText->words_and_emoticons)->toContain('Wow'); + expect($sentiText->words_and_emoticons)->toContain('Really'); + expect($sentiText->words_and_emoticons)->toContain('That'); + expect($sentiText->words_and_emoticons)->toContain('is'); + expect($sentiText->words_and_emoticons)->toContain('good'); + }); + + it('preserves word order in words_and_emoticons', function () { + $text = 'First second third fourth'; + $sentiText = new SentiText($text); + + $words = $sentiText->words_and_emoticons; + expect($words[0])->toBe('First'); + expect($words[1])->toBe('second'); + expect($words[2])->toBe('third'); + expect($words[3])->toBe('fourth'); + }); +});