| |
php-text-statistics |
Hello all,
I've translated the php-text-statistics package to Ruby; you can view
Regards,
############### Code
module ReadabilityIndices
class Readability
NumDecimalPlaces = 1
Titles = SequencedHash.new
attr_accessor :text
def valid_index?(index)
def flesch_kincaid_grade_level
def flesch_kincaid_reading_ease
def gunning_fog_score
def coleman_liau_index
def smog_index
def automated_readability_index
Colon = ": "
# private
def round(num, decimals)
def letter_count
def sentence_count
def word_count
def get_words
def average_words_per_sentence
def average_syllables_per_word
def total_syllables
def words_with_three_syllables(count_proper_nouns = true)
def percentage_words_with_three_syllables(count_proper_nouns =
ProblemWords = {
MultiSyllablesThatAreOne = [
UniSyllablesThatAreTwo = [
PrefixesAndSuffixes = [
def syllable_count(word)
#find number and delete prefixes and suffixes
#remove non-word chars
#count word parts:
#subtract out syllables that are really one:
#add syllables that are really two:
return [1, num_syllables].max
############### RSpec tests
describe "readability indices" do
it "should count simple syllable words correctly" do
it "should count syllables on programmed exceptions" do
it "should count complex syllable words correctly" do
it "should calculate average syllables per word" do
it "should count words correctly" do
it "should get percentage of words with three syllables" do
it "should count letters" do
it "should count sentences" do
it "should calculate average words per sentence" do
describe "test indices directly" do
it "should calculate flesch-kincaid reading ease" do
it "should calculate flesch-kincaid grade level" do
it "should calculate Gunning-Fog Score" do
it "should calculate coleman-liau index" do
it "should calculate smog index" do
it "should calculate automated readability index" do
it "should index first paragraph of Moby Dick correctly" do
readability = Readability.new(str)
readability.letter_count.should == 884
readability.flesch_kincaid_grade_level.should == 12.1
it "should index a Kipling poem correctly" do
If you can dream - and not make dreams your master,
If you can make one heap of all your winnings
If you can talk with crowds and keep your virtue,
readability = Readability.new(str)
readability.letter_count.should == 1125
readability.flesch_kincaid_grade_level.should == 111.9
the files below. Please note that I couldn't get the Gunning Fog Score to
work 100%.
Adam
require 'collections/sequenced_hash'
Titles[:flesch_kincaid_grade_level] = 'Flesch-Kincaid Grade level'
Titles[:flesch_kincaid_reading_ease] = 'Flesch-Kincaid Reading
Ease'
Titles[:gunning_fog_score] = 'Gunning-Fog score'
Titles[:coleman_liau_index] = 'Coleman-Liau Index'
Titles[:smog_index] = 'SMOG Index'
Titles[:automated_readability_index] = 'Automated Readability
Index'
# Builds a new instance for +text+ (an empty string when omitted).
# The raw input is run through clean_text and the result is stored
# through the text accessor.
def initialize(text = '')
  normalized = clean_text(text)
  self.text = normalized
end
Titles[index] ? true : false
end
round(0.39 * average_words_per_sentence + 11.8 *
average_syllables_per_word - 15.59, NumDecimalPlaces)
end
round(206.835 - 1.015 * average_words_per_sentence - 84.6 *
average_syllables_per_word, NumDecimalPlaces)
end
round((average_words_per_sentence +
percentage_words_with_three_syllables(false)) * 0.4, NumDecimalPlaces)
end
round(5.89 * letter_count / word_count - 0.3 * sentence_count /
word_count - 15.8, NumDecimalPlaces)
end
# Appears to be the body of `smog_index` (the `def` line is listed separately
# in the outline earlier in this listing). Computes
#   1.043 * sqrt(words_with_three_syllables * (30 / sentence_count) + 3.1291)
# and rounds it to NumDecimalPlaces.
# NOTE(review): `30 / sentence_count` looks like Integer / Integer, which Ruby
# truncates (e.g. 30 / 8 => 3) -- confirm whether float division was intended.
round(1.043 * Math.sqrt((words_with_three_syllables * (30 /
sentence_count)) + 3.1291), NumDecimalPlaces)
end
round(4.71 * letter_count / word_count + 0.5 * word_count /
sentence_count - 21.43, NumDecimalPlaces)
end
Separator = ", "
# Formats readability scores as one line of "Title<Colon> value" pairs joined
# by Separator.
#
# indices     - Array of Titles keys naming the scores to report; every key in
#               Titles is used when the array is empty.
# diagnostics - when true, the word, sentence and letter counts are prepended.
#
# Each index is invoked on self via send, so it must name a method of this
# object. Colon and Separator already end in a space, so the literal space that
# follows them yields the double spacing seen in the output.
#
# Returns the formatted String on a single line.
def get_indices_as_string(indices = [], diagnostics = true)
  indices = Titles.keys if indices.empty?
  scores = indices.map { |index| "#{Titles[index]}#{Colon} #{send(index)}" }.join(Separator)
  return scores unless diagnostics

  # Adjacent literals are joined so no newline or stray "#" leaks into the output.
  counts = "words#{Colon} #{word_count}#{Separator} " \
           "sentences#{Colon} #{sentence_count}#{Separator} " \
           "characters#{Colon} #{letter_count}#{Separator}"
  counts + scores
end
# Normalises raw input so the counting methods see a predictable layout:
# words separated by single spaces and every sentence terminator unified to
# "." followed by one space (the trailing space is stripped at the end).
#
# text - the String to clean. It is not modified, so frozen strings are fine.
#
# Returns a new String, or '' when the input holds nothing but whitespace and
# the punctuation that is treated as spacing.
def clean_text(text)
  text = text.gsub(/[,:;()-]/, ' ') # Replace commas, hyphens etc. (count them as spaces)
  text = text.gsub(/[\.!?]/, '.')   # Unify terminators
  text = text.strip
  # Nothing left to analyse: do not turn empty input into a lone "." sentence.
  return '' if text.empty?

  text += '.'                                       # Add final terminator, just in case it's missing
  text = text.gsub(/[ ]*(\n|\r\n|\r)[ ]*/, ' ')     # Replace new lines with spaces
  text = text.gsub(/([\.])[\.\s?]+/, '.')           # Collapse duplicated terminators
  # Pad sentence terminators. The single-quoted '\1' is expanded for each
  # match; an interpolated "#{$1}" would be evaluated once, before gsub runs,
  # and would silently drop the terminator whenever the previous substitution
  # had matched nothing.
  text = text.gsub(/[ ]*([\.])/, '\1 ')
  text = text.gsub(/[ ]+/, ' ')                     # Remove multiple spaces
  # TODO: the PHP original also lower-cases the word following each terminator
  # (for the Gunning Fog score); that step has not been ported:
  #   $strText = preg_replace_callback('/\. [^ ]+/', create_function('$matches', 'return strtolower($matches[0]);'), $strText);
  text.strip
end
# Appears to be the body of `round(num, decimals)` (the `def` line is listed
# separately in the outline earlier in this listing): scales num up, rounds to
# an integer, then scales back down as a Float.
# NOTE(review): the scale factor is `10 * decimals`, not `10 ** decimals`. The
# two agree only for decimals == 1 (the value of NumDecimalPlaces); any other
# value rounds to the wrong precision, and 0 divides by 0.0 -- confirm intent.
(num * 10 * decimals).round / (10 * decimals).to_f
end
self.text.gsub(/[^A-Za-z]+/, '').length.to_i
end
[1, self.text.split(/\.!?/).length].max
end
get_words.length
end
# Appears to be the body of `get_words`: splits self.text on runs of
# whitespace and caches the resulting array in @words.
# NOTE(review): nothing in this fragment clears @words, so assigning a new
# value through the `text` accessor would keep returning the old words --
# verify against callers.
@words ||= self.text.split(/\s+/)
end
word_count / sentence_count.to_f
end
total_syllables / get_words.length.to_f
end
get_words.inject(0){|sum, word| sum + syllable_count(word)}
end
get_words.inject(0) do |sum, word|
if syllable_count(word) >= 3
if count_proper_nouns
sum += 1
else
sum += 1 if word[0..0] == word[0..0].downcase
end
end
sum
end
end
true)
words_with_three_syllables(count_proper_nouns) / word_count.to_f
* 100
end
'simile' => 3,
'forever' => 3,
'shoreline' => 2
}
/cial/,
/tia/,
/cius/,
/cious/,
/giu/,
/ion/,
/iou/,
/sia$/,
/[^aeiuoyt]{2,}ed$/,
/.ely$/,
/[cg]h?e[rsd]?$/,
/rved?$/,
/[aeiouy][dt]es?$/,
/[aeiouy][^aeiouydt]e[rsd]?$/,
/^[dr]e[aeiou][^aeiou]+$/, #Sorts out deal, deign etc
/[aeiouy]rse$/ #Purse, hears
]
/ia/,
/riet/,
/dien/,
/iu/,
/io/,
/ii/,
/[aeiouym]bl$/,
/[aeiou]{3}/,
/^mc/,
/ism$/,
/([^aeiouy])\1l$/,
/[^l]lien/,
/^coa[dglx]./,
/[^gq]ua[^auieo]/,
/dnt$/,
/uity$/,
/ie(r|st)$/
]
/^un/,
/^fore/,
/ly$/,
/less$/,
/ful$/,
/ers?$/,
/ings?$/
]
# Appears to be the body of `syllable_count(word)` (the `def` line and the
# constant headers are listed separately in the outline earlier in this
# listing). Lower-cases the word, answers straight from ProblemWords when it
# is listed there, otherwise counts affix matches plus vowel groups and then
# adjusts the total with the two regex tables.
# NOTE(review): this fragment ends without returning num_syllables; the
# outline shows `return [1, num_syllables].max` as the final statement.
word = word.downcase.strip
#handle problem words first
return ProblemWords[word] if ProblemWords[word]
# Every PrefixesAndSuffixes match counts as one syllable and is then removed
# from the word so it is not counted again below.
num_syllables = PrefixesAndSuffixes.inject(0) do |sum, prefix|
word.scan(prefix){sum += 1}
word.gsub!(prefix, '')
sum
end
# NOTE(review): in Ruby the `s` regexp flag selects an encoding rather than
# "dot matches newline"; it looks carried over from the PHP pattern -- confirm
# it can be dropped.
word.gsub!(/[^a-z]/is, '')
# Splitting on consonant runs leaves the vowel groups; each non-empty one
# counts as a syllable.
# NOTE(review): `blank?` is not a core String method; presumably supplied by
# ActiveSupport -- confirm it is loaded.
num_syllables += word.split(/[^aeiouy]+/).inject(0){|sum,
word_part| sum + (word_part.blank? ? 0 : 1)}
# Patterns that were counted as extra syllables but are pronounced as one.
MultiSyllablesThatAreOne.each{|syl| word.scan(syl){num_syllables
-= 1}}
# Patterns that were counted as one syllable but are pronounced as two.
UniSyllablesThatAreTwo.each{|syl| word.scan(syl){num_syllables
+= 1}}
end
end
end
include ReadabilityIndices
before(:each) do
@readability_blank = Readability.new
end
@readability_blank.syllable_count('a').should == 1
@readability_blank.syllable_count('was').should == 1
@readability_blank.syllable_count('the').should == 1
@readability_blank.syllable_count('and').should == 1
@readability_blank.syllable_count('foobar').should == 2
@readability_blank.syllable_count('hello').should == 2
@readability_blank.syllable_count('world').should == 1
@readability_blank.syllable_count('wonderful').should == 3
@readability_blank.syllable_count('simple').should == 2
@readability_blank.syllable_count('easy').should == 2
@readability_blank.syllable_count('hard').should == 1
@readability_blank.syllable_count('quick').should == 1
@readability_blank.syllable_count('brown').should == 1
@readability_blank.syllable_count('fox').should == 1
@readability_blank.syllable_count('jumped').should == 1
@readability_blank.syllable_count('over').should == 2
@readability_blank.syllable_count('lazy').should == 2
@readability_blank.syllable_count('dog').should == 1
@readability_blank.syllable_count('camera').should == 3
end
@readability_blank.syllable_count('simile').should == 3
@readability_blank.syllable_count('shoreline').should == 2
@readability_blank.syllable_count('forever').should == 3
end
@readability_blank.syllable_count
('antidisestablishmentarianism').should == 12
@readability_blank.syllable_count
('supercalifragilisticexpialidocious').should == 14
@readability_blank.syllable_count
('chlorofluorocarbonation').should == 8
@readability_blank.syllable_count('forethoughtfulness').should
== 4
@readability_blank.syllable_count('phosphorescent').should == 4
@readability_blank.syllable_count('theoretician').should == 5
@readability_blank.syllable_count('promiscuity').should == 5
@readability_blank.syllable_count('unbutlering').should == 4
@readability_blank.syllable_count('continuity').should == 5
@readability_blank.syllable_count('craunched').should == 1
@readability_blank.syllable_count('squelched').should == 1
@readability_blank.syllable_count('scrounge').should == 1
@readability_blank.syllable_count('coughed').should == 1
@readability_blank.syllable_count('smile').should == 1
@readability_blank.syllable_count('monopoly').should == 4
@readability_blank.syllable_count('doughey').should == 2
@readability_blank.syllable_count('doughier').should == 3
@readability_blank.syllable_count('leguminous').should == 4
@readability_blank.syllable_count('thoroughbreds').should == 3
@readability_blank.syllable_count('special').should == 2
@readability_blank.syllable_count('delicious').should == 3
@readability_blank.syllable_count('spatial').should == 2
@readability_blank.syllable_count('pacifism').should == 4
@readability_blank.syllable_count('coagulant').should == 4
@readability_blank.syllable_count('shouldn\'t').should == 2
@readability_blank.syllable_count('mcdonald').should == 3
@readability_blank.syllable_count('audience').should == 3
@readability_blank.syllable_count('finance').should == 2
@readability_blank.syllable_count('prevalence').should == 3
@readability_blank.syllable_count('impropriety').should == 5
@readability_blank.syllable_count('alien').should == 3
@readability_blank.syllable_count('dreadnought').should == 2
@readability_blank.syllable_count('verandah').should == 3
@readability_blank.syllable_count('similar').should == 3
@readability_blank.syllable_count('similarly').should == 4
@readability_blank.syllable_count('central').should == 2
@readability_blank.syllable_count('cyst').should == 1
@readability_blank.syllable_count('term').should == 1
@readability_blank.syllable_count('order').should == 2
@readability_blank.syllable_count('fur').should == 1
@readability_blank.syllable_count('sugar').should == 2
@readability_blank.syllable_count('paper').should == 2
@readability_blank.syllable_count('make').should == 1
@readability_blank.syllable_count('gem').should == 1
@readability_blank.syllable_count('program').should == 2
@readability_blank.syllable_count('hopeless').should == 2
@readability_blank.syllable_count('hopelessly').should == 3
@readability_blank.syllable_count('careful').should == 2
@readability_blank.syllable_count('carefully').should == 3
@readability_blank.syllable_count('stuffy').should == 2
@readability_blank.syllable_count('thistle').should == 2
@readability_blank.syllable_count('teacher').should == 2
@readability_blank.syllable_count('unhappy').should == 3
@readability_blank.syllable_count('ambiguity').should == 5
@readability_blank.syllable_count('validity').should == 4
@readability_blank.syllable_count('ambiguous').should == 4
@readability_blank.syllable_count('deserve').should == 2
@readability_blank.syllable_count('blooper').should == 2
@readability_blank.syllable_count('scooped').should == 1
@readability_blank.syllable_count('deserve').should == 2
@readability_blank.syllable_count('deal').should == 1
@readability_blank.syllable_count('death').should == 1
@readability_blank.syllable_count('dearth').should == 1
@readability_blank.syllable_count('deign').should == 1
@readability_blank.syllable_count('reign').should == 1
@readability_blank.syllable_count('bedsore').should == 2
@readability_blank.syllable_count('anorexia').should == 5
@readability_blank.syllable_count('anymore').should == 3
@readability_blank.syllable_count('cored').should == 1
@readability_blank.syllable_count('sore').should == 1
@readability_blank.syllable_count('foremost').should == 2
@readability_blank.syllable_count('restore').should == 2
@readability_blank.syllable_count('minute').should == 2
@readability_blank.syllable_count('manticores').should == 3
@readability_blank.syllable_count('asparagus').should == 4
@readability_blank.syllable_count('unexplored').should == 3
@readability_blank.syllable_count('unexploded').should == 4
@readability_blank.syllable_count('CAPITALS').should == 3
end
Readability.new('and then there was
one').average_syllables_per_word.should == 1
Readability.new('because special ducklings deserve
rainbows').average_syllables_per_word.should == 2
Readability.new('and then there was one because special
ducklings deserve rainbows').average_syllables_per_word.should ==
1.5
end
Readability.new('The quick brown fox jumped over the lazy
dog').word_count.should == 9
Readability.new('The quick brown fox jumped over the lazy
dog.').word_count.should == 9
Readability.new('The quick brown fox jumped over the lazy dog.
').word_count.should == 9
Readability.new(' The quick brown fox jumped over the lazy dog.
').word_count.should == 9
Readability.new(' The quick brown fox jumped over the lazy dog.
').word_count.should == 9
Readability.new('Yes. No.').word_count.should == 2
Readability.new('Yes.No.').word_count.should == 2
Readability.new('Yes.No.').word_count.should == 2
Readability.new('Yes . No.').word_count.should == 2
Readability.new('Yes - No. ').word_count.should == 2
end
Readability.new('there is just one word with three syllables in
this sentence').percentage_words_with_three_syllables.round.should ==
9
Readability.new('there are no valid words with three Syllables
in this sentence').percentage_words_with_three_syllables.round.should
== 9
Readability.new('there is one and only one word with three or
more syllables in this long boring sentence of twenty
words').percentage_words_with_three_syllables.round.should == 5
Readability.new('there are two and only two words with three or
more syllables in this long sentence of exactly twenty
words').percentage_words_with_three_syllables.round.should == 10
Readability.new('there is Actually only one valid word with
three or more syllables in this long sentence of Exactly twenty
words').percentage_words_with_three_syllables(false).round.should == 5
Readability.new('no long words in this
sentence').percentage_words_with_three_syllables.round.should == 0
Readability.new('no long valid words in this sentence because
the test ignores proper case words like this
Behemoth').percentage_words_with_three_syllables(false).round.should
== 0
end
Readability.new('a').letter_count.should == 1
Readability.new('').letter_count.should == 0
Readability.new('this sentence has 30 characters, not including
the digits').letter_count.should == 46
end
Readability.new('This is a sentence').sentence_count.should == 1
Readability.new('This is a sentence.').sentence_count.should ==
1
Readability.new('This is a sentence!').sentence_count.should ==
1
Readability.new('This is a sentence?').sentence_count.should ==
1
Readability.new('This is a sentence..').sentence_count.should ==
1
Readability.new('This is a sentence. So is
this.').sentence_count.should == 2
Readability.new("This is a sentence. \n\n So is this, but this
is multi-line!").sentence_count.should == 2
Readability.new('This is a sentence,. So is
this.').sentence_count.should == 2
Readability.new('This is a sentence!? So is
this.').sentence_count.should == 2
Readability.new('This is a sentence. So is this. And this one as
well.').sentence_count.should == 3
Readability.new('This is a sentence - but just
one.').sentence_count.should == 1
Readability.new('This is a sentence (but just
one).').sentence_count.should == 1
end
Readability.new('This is a
sentence').average_words_per_sentence.should == 4
Readability.new('This is a
sentence.').average_words_per_sentence.should == 4
Readability.new('This is a sentence.
').average_words_per_sentence.should == 4
Readability.new('This is a sentence. This is a
sentence').average_words_per_sentence.should == 4
Readability.new('This is a sentence. This is a
sentence.').average_words_per_sentence.should == 4
Readability.new('This, is - a sentence . This is a sentence.
').average_words_per_sentence.should == 4
Readability.new('This is a sentence with extra text. This is a
sentence. ').average_words_per_sentence.should == 5.5
Readability.new('This is a sentence with some extra text. This
is a sentence. ').average_words_per_sentence.should == 6
end
before(:each) do
@str_a = 'This. Is. A. Nice. Set. Of. Small. Words. Of. One.
Part. Each.'
@str_b = 'The quick brown fox jumped over the lazy dog.'
@str_c = 'The quick brown fox jumped over the lazy dog. The
quick brown fox jumped over the lazy dog.'
@str_d = "The quick brown fox jumped over the lazy dog. \n\n
The quick brown fox jumped over the lazy dog."
@str_e = 'The quick brown fox jumped over the lazy dog. The
quick brown fox jumped over the lazy dog'
@str_f = 'Now it is time for a more complicated sentence,
including several longer words.'
@str_g = 'Now it is time for a more Complicated sentence,
including Several longer words.'
end
Readability.new(@str_a).flesch_kincaid_reading_ease.should ==
121.2
Readability.new(@str_b).flesch_kincaid_reading_ease.should ==
94.3
Readability.new(@str_c).flesch_kincaid_reading_ease.should ==
94.3
Readability.new(@str_d).flesch_kincaid_reading_ease.should ==
94.3
Readability.new(@str_e).flesch_kincaid_reading_ease.should ==
94.3
Readability.new(@str_f).flesch_kincaid_reading_ease.should ==
50.5
end
Readability.new(@str_a).flesch_kincaid_grade_level.should ==
-3.4
Readability.new(@str_b).flesch_kincaid_grade_level.should ==
2.3
Readability.new(@str_c).flesch_kincaid_grade_level.should ==
2.3
Readability.new(@str_d).flesch_kincaid_grade_level.should ==
2.3
Readability.new(@str_e).flesch_kincaid_grade_level.should ==
2.3
Readability.new(@str_f).flesch_kincaid_grade_level.should ==
9.4
end
Readability.new(@str_a).gunning_fog_score.should == 0.4
Readability.new(@str_b).gunning_fog_score.should == 3.6
Readability.new(@str_c).gunning_fog_score.should == 3.6
Readability.new(@str_d).gunning_fog_score.should == 3.6
Readability.new(@str_e).gunning_fog_score.should == 3.6
Readability.new(@str_f).gunning_fog_score.should == 14.4
Readability.new(@str_g).gunning_fog_score.should == 8.3
end
Readability.new(@str_a).coleman_liau_index.should == 3.0
Readability.new(@str_b).coleman_liau_index.should == 7.7
Readability.new(@str_c).coleman_liau_index.should == 7.7
Readability.new(@str_d).coleman_liau_index.should == 7.7
Readability.new(@str_e).coleman_liau_index.should == 7.7
Readability.new(@str_f).coleman_liau_index.should ==
13.6
end
Readability.new(@str_a).smog_index.should == 1.8
Readability.new(@str_b).smog_index.should == 1.8
Readability.new(@str_c).smog_index.should == 1.8
Readability.new(@str_d).smog_index.should == 1.8
Readability.new(@str_e).smog_index.should == 1.8
Readability.new(@str_f).smog_index.should ==
10.1
end
Readability.new(@str_a).automated_readability_index.should ==
-5.6
Readability.new(@str_b).automated_readability_index.should ==
1.9
Readability.new(@str_c).automated_readability_index.should ==
1.9
Readability.new(@str_d).automated_readability_index.should ==
1.9
Readability.new(@str_e).automated_readability_index.should ==
1.9
Readability.new(@str_f).automated_readability_index.should ==
8.6
end
str =<<-ENDL
Call me Ishmael. Some years ago - never mind how long
precisely - having little or no money in my purse, and
nothing particular to interest me on shore, I thought I
would sail about a little and see the watery part of
the world. It is a way I have of driving off the spleen, and
regulating the circulation. Whenever I find myself
growing grim about the mouth; whenever it is a damp, drizzly
November in my soul; whenever I find myself
involuntarily pausing before coffin warehouses, and bringing
up the rear of every funeral I meet; and especially
whenever my hypos get such an upper hand of me, that it
requires a strong moral principle to prevent me from
deliberately stepping into the street, and methodically
knocking people's hats off - then, I account it high time
to get to sea as soon as I can. This is my substitute for
pistol and ball. With a philosophical flourish Cato
throws himself upon his sword; I quietly take to the ship.
There is nothing surprising in this. If they but knew
it, almost all men in their degree, some time or other,
cherish very nearly the same feelings towards the ocean with me.
ENDL
readability.word_count.should == 201
readability.total_syllables.should == 304
readability.sentence_count.should == 8
readability.words_with_three_syllables.should == 23
readability.flesch_kincaid_reading_ease.should == 53.4
readability.gunning_fog_score.should == 14.2
readability.coleman_liau_index.should == 10.1
readability.smog_index.should == 8.9
readability.automated_readability_index.should == 11.8
end
str =<<-ENDL
If you can keep your head when all about you
Are losing theirs and blaming it on you,
If you can trust yourself when all men doubt you
But make allowance for their doubting too,
If you can wait and not be tired by waiting,
Or being lied about, don't deal in lies,
Or being hated, don't give way to hating,
And yet don't look too good, nor talk too wise:
If you can think - and not make thoughts your aim;
If you can meet with Triumph and Disaster
And treat those two impostors just the same;
If you can bear to hear the truth you've spoken
Twisted by knaves to make a trap for fools,
Or watch the things you gave your life to, broken,
And stoop and build 'em up with worn-out tools:
And risk it all on one turn of pitch-and-toss,
And lose, and start again at your beginnings
And never breath a word about your loss;
If you can force your heart and nerve and sinew
To serve your turn long after they are gone,
And so hold on when there is nothing in you
Except the Will which says to them: "Hold on"
Or walk with kings - nor lose the common touch,
If neither foes nor loving friends can hurt you;
If all men count with you, but none too much,
If you can fill the unforgiving minute
With sixty seconds' worth of distance run,
Yours is the Earth and everything that's in it,
And - which is more - you'll be a Man, my son
ENDL
readability.word_count.should == 292
readability.total_syllables.should == 338
readability.sentence_count.should == 1
readability.words_with_three_syllables.should == 6
readability.flesch_kincaid_reading_ease.should == -187.5
readability.gunning_fog_score.should == 117.5
readability.coleman_liau_index.should == 6.9
readability.smog_index.should == 14.1
readability.automated_readability_index.should == 142.7
end
end
end