You can apply the Natural Language Parser to build a list
of keywords characterizing the topic of a document.
Keyword builder online is a web implementation of the algorithm.
Keyword extraction and analysis is extensively used in information retrieval (in
search algorithms) and document clustering (automatic document summarization).
Words in an Utterance
are syntactically non-equal. Subject, verb or object are more important for
understanding a phrase, while adjective or adverb modifiers usually may be
omitted or play a helper role by adding more information to the core meaning.
This fact may be used when calculating the importance of a Word
for the text.
Part of speech also affects the Word score. For
example, auxiliary verbs cannot describe the document theme. Proper nouns are
more important than common nouns because they are more unique.
Keywords, as a characteristic of a theme, do not make
much sense for multi-topic texts like books, stories or encyclopedias.
Multi-topic texts are typically larger than single-topic texts. Normally they
are organized in chapters or articles, which are single-topic. If it is not
known in advance whether a text is mono-topic, the keyword technique may be
adjusted: calculate a list of keywords for a text window and move this window over the
multi-topic text. If the set of keywords has changed, it is an indication that
the topic has changed.
Below is a simple algorithm for building a keyword list for a mono-topic text.
You can copy-paste, compile and execute the code in Visual Studio.
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
// reference to NlpLib.dll is required
using Nlp4Net.NlpLib;
namespace keywords
{
/// <summary>
/// Orders keyword/weight pairs by weight, highest weight first.
/// Only the <c>Value</c> of each pair takes part in the comparison; keys are ignored.
/// </summary>
internal class CWeightCompararer : IComparer<KeyValuePair<string, double>>
{
    /// <summary>Shared instance for callers that just need a comparer.</summary>
    internal static CWeightCompararer Comparer = new CWeightCompararer();

    /// <summary>
    /// Compares two keyword/weight pairs in descending weight order.
    /// </summary>
    /// <param name="x">First pair.</param>
    /// <param name="y">Second pair.</param>
    /// <returns>
    /// A negative number when <paramref name="x"/> has the larger weight, a positive
    /// number when <paramref name="y"/> has the larger weight, and zero when the
    /// weights are equal.
    /// </returns>
    public int Compare(KeyValuePair<string, double> x, KeyValuePair<string, double> y)
    {
        double firstWeight = x.Value;
        double secondWeight = y.Value;

        // The operands are swapped on purpose so that larger weights sort first.
        return secondWeight.CompareTo(firstWeight);
    }
}
/// <summary>
/// Console tool that scores the words of a text file with the natural language
/// parser and prints the highest-scoring ones as the keywords of the text.
/// </summary>
class Program
{
    /// <summary>Number of keywords printed when the command line does not specify one.</summary>
    private const int DefaultMaxKeyWords = 10;

    /// <summary>
    /// Entry point. Expects: file name, optional word-class switch ("verbs" or "nouns"),
    /// optional number of keywords, optional encoding name.
    /// Invalid arguments are reported on the console instead of terminating the
    /// process with an unhandled exception.
    /// </summary>
    /// <param name="args">Command-line arguments as described by the usage text.</param>
    static void Main(string[] args)
    {
        if (0 == args.Length)
        {
            PrintUsage();
            return;
        }

        string szFile = args[0];
        if (!File.Exists(szFile))
        {
            Console.WriteLine("Cannot find file: " + szFile);
            return;
        }

        // The usage text promises "verbs" as the default, so verbs are only excluded
        // when the second argument is present and is something other than "verbs".
        bool blnUseVerbs = (args.Length <= 1)
            || string.Equals(args[1], "verbs", StringComparison.OrdinalIgnoreCase);

        int intMaxKeyWords = DefaultMaxKeyWords;
        if ((args.Length > 2) && (!int.TryParse(args[2], out intMaxKeyWords) || (intMaxKeyWords < 0)))
        {
            Console.WriteLine("Invalid number of keywords: " + args[2]);
            return;
        }

        // is encoding specified explicitly?
        Encoding encoding = Encoding.UTF8;
        if (args.Length > 3)
        {
            try
            {
                encoding = Encoding.GetEncoding(args[3]);
            }
            catch (ArgumentException)
            {
                // GetEncoding rejects names it does not know (for example a misspelt "asci").
                Console.WriteLine("Unknown encoding: " + args[3]);
                return;
            }
        }

        long lngUtterances;
        long lngSyntaxes;
        SortedList<string, double> lstKeyWords =
            ScoreWords(szFile, encoding, blnUseVerbs, out lngUtterances, out lngSyntaxes);
        List<KeyValuePair<string, double>> lstBestKeyWords =
            SelectBestKeyWords(lstKeyWords, intMaxKeyWords);

        // display the list of keywords
        Console.WriteLine("File: " + szFile);
        Console.WriteLine("Allow verbs: " + blnUseVerbs);
        Console.WriteLine(string.Format("Utterances: {0}, diagrams: {1}", lngUtterances, lngSyntaxes));
        Console.WriteLine("Key words:\r\n");
        Console.ForegroundColor = ConsoleColor.Green;
        try
        {
            foreach (KeyValuePair<string, double> kp in lstBestKeyWords)
            {
                Console.WriteLine(kp.Key + " " + kp.Value);
            }
        }
        finally
        {
            // Do not leave the user's console green after the program exits.
            Console.ResetColor();
        }
    }

    /// <summary>Prints the command-line syntax and a few example invocations.</summary>
    private static void PrintUsage()
    {
        Console.WriteLine("usage: keywords.exe <fileName> [use verbs:verbs(default),nouns] [number of keywords:10(default)] [encoding:utf-8(default),ascii,utf-16]");
        Console.WriteLine("examples:\r\n");
        Console.WriteLine(@"keywords.exe c:\text.txt");
        Console.WriteLine(@"keywords.exe c:\text.txt verbs 20");
        Console.WriteLine(@"keywords.exe c:\text.txt nouns 10 ASCII");
    }

    /// <summary>
    /// Parses the file and accumulates a score for every noun (and optionally verb)
    /// found in a subject, verb, object or noun-complement node.
    /// </summary>
    /// <param name="szFile">Path of the text file to parse.</param>
    /// <param name="encoding">Encoding used to read the file.</param>
    /// <param name="blnUseVerbs">When true, non-auxiliary verbs are scored as well as nouns.</param>
    /// <param name="lngUtterances">Receives the number of utterances the parser produced.</param>
    /// <param name="lngSyntaxes">Receives the total number of syntax nodes visited at the top level of all utterances.</param>
    /// <returns>Case-insensitive map from word text to accumulated score.</returns>
    private static SortedList<string, double> ScoreWords(string szFile, Encoding encoding, bool blnUseVerbs, out long lngUtterances, out long lngSyntaxes)
    {
        NLParser parser = new NLParser();
        SortedList<string, double> lstKeyWords = new SortedList<string, double>(StringComparer.InvariantCultureIgnoreCase);

        // Counted in locals; the out parameters are assigned once at the end.
        long lngUtteranceCount = 0;
        long lngSyntaxCount = 0;

        foreach (Utterance utterance in parser.Text<Utterance>(szFile, encoding))
        {
            lngUtteranceCount++;
            if ((null == utterance.Syntaxes) || (0 == utterance.Syntaxes.Length))
            {
                continue;
            }

            // An utterance with several syntaxes splits its weight evenly between them,
            // so every utterance contributes on the same scale.
            double fWeight = 1.0 / utterance.Syntaxes.Length;
            foreach (SyntaxNode syntax in utterance.Syntaxes)
            {
                lngSyntaxCount++;
                syntax.ForEach(delegate(SyntaxNode child)
                {
                    switch (child.NodeRole)
                    {
                        case RKRole.subject: // The *king gave Anne Boleyn his love
                        case RKRole.verb: // The king *gave Anne Boleyn his love
                        case RKRole.directObject: // The king gave Anne Boleyn his *love
                        case RKRole.indirectObject: // The king gave *Anne *Boleyn his love
                        case RKRole.subjectComplementNoun: // The king is *Henry VIII.
                            foreach (Word word in child.Words)
                            {
                                double fAddWeight = GetWordWeight(word, fWeight, blnUseVerbs);
                                if (0 != fAddWeight)
                                {
                                    AddScore(lstKeyWords, word.Text, fAddWeight);
                                }
                            }
                            break;
                    }
                });
            }
        }

        lngUtterances = lngUtteranceCount;
        lngSyntaxes = lngSyntaxCount;
        return lstKeyWords;
    }

    /// <summary>
    /// Returns the score a single word occurrence contributes, or 0 when the word
    /// cannot be a keyword.
    /// </summary>
    /// <param name="word">Word to evaluate.</param>
    /// <param name="fWeight">Base weight of one occurrence in the current syntax.</param>
    /// <param name="blnUseVerbs">When true, non-auxiliary verbs receive the base weight.</param>
    /// <returns>0, the base weight, or twice the base weight for proper nouns.</returns>
    private static double GetWordWeight(Word word, double fWeight, bool blnUseVerbs)
    {
        // only want verb and noun
        // pronoun cannot be a key word because it substitutes the noun
        switch (word.POS)
        {
            case PartOfSpeech.verb:
                // auxiliary verbs are not interesting
                if (blnUseVerbs && !word.HasAny(SyntaxTag.auxilary_verb))
                {
                    return fWeight;
                }
                return 0;

            case PartOfSpeech.noun:
                // proper nouns are better candidates for key words
                if (word.HasAny(SyntaxTag.proper))
                {
                    return fWeight * 2;
                }
                return fWeight;

            default:
                return 0;
        }
    }

    /// <summary>Adds <paramref name="fAddWeight"/> to the score stored for <paramref name="szWord"/>, creating the entry if needed.</summary>
    private static void AddScore(SortedList<string, double> lstKeyWords, string szWord, double fAddWeight)
    {
        double fCurrent;
        if (lstKeyWords.TryGetValue(szWord, out fCurrent))
        {
            lstKeyWords[szWord] = fCurrent + fAddWeight;
        }
        else
        {
            lstKeyWords.Add(szWord, fAddWeight);
        }
    }

    /// <summary>
    /// Selects the words with the highest score.
    /// </summary>
    /// <param name="lstKeyWords">All scored words.</param>
    /// <param name="intMaxKeyWords">Maximum number of entries to return; must not be negative.</param>
    /// <returns>At most <paramref name="intMaxKeyWords"/> entries, ordered by descending score.</returns>
    private static List<KeyValuePair<string, double>> SelectBestKeyWords(SortedList<string, double> lstKeyWords, int intMaxKeyWords)
    {
        // The list never holds more than min(limit, available) + 1 entries; sizing it this way
        // also avoids overflowing "limit + 1" and over-allocating for a very large limit.
        int intCapacity = Math.Min(intMaxKeyWords, lstKeyWords.Count) + 1;
        List<KeyValuePair<string, double>> lstBestKeyWords = new List<KeyValuePair<string, double>>(intCapacity);

        foreach (KeyValuePair<string, double> kp in lstKeyWords)
        {
            // Keep the list sorted by descending score: insert at the sorted position...
            int intIndex = lstBestKeyWords.BinarySearch(kp, CWeightCompararer.Comparer);
            if (intIndex < 0)
            {
                intIndex = ~intIndex;
            }
            lstBestKeyWords.Insert(intIndex, kp);

            // ...and drop the lowest-scoring entry once the limit is exceeded.
            if (lstBestKeyWords.Count > intMaxKeyWords)
            {
                lstBestKeyWords.RemoveAt(lstBestKeyWords.Count - 1);
            }
        }

        return lstBestKeyWords;
    }
}
}