How to build a list of keywords for a topic.

This article explains how to build a list of keywords.

You can apply a natural language parser to build a list of keywords that characterize the topic of a document.

Keyword builder online is a web implementation of the algorithm.

Keyword extraction and analysis is extensively used in information retrieval (in search algorithms) and document clustering (automatic document summarization).

Words in an Utterance are not syntactically equal. The subject, verb and object are more important for understanding a phrase, while adjective or adverb modifiers can usually be omitted, or play a helper role by adding more information to the core meaning. This fact can be used when calculating the importance of a Word for the text.

Part of speech also affects the Word score. For example, auxiliary verbs cannot describe the document theme. Proper nouns are more important than common nouns because they are more distinctive.

Keywords, as a characteristic of a theme, do not make much sense for multi-topic texts like books, stories or encyclopedias. Multi-topic texts are typically larger than single-topic texts. Normally they are organized in chapters or articles, each of which is single-topic. If it is not known in advance whether a text is mono-topic, the keyword technique may be adjusted: calculate a list of keywords for a text window and move this window over the multi-topic text. If the set of keywords changes, it is an indication that the topic has changed.

Below is a simple algorithm for building a keyword list for a mono-topic text. You can copy, paste, compile and execute the code in Visual Studio.

using System;
using System.Collections.Generic;
using System.Text;
using System.IO;

// reference to NlpLib.dll is required
using Nlp4Net.NlpLib;

namespace keywords
{
    /// <summary>
    /// Orders keyword/score pairs by score only, highest score first.
    /// The keyword text (the pair's key) takes no part in the comparison.
    /// </summary>
    internal class CWeightCompararer : IComparer<KeyValuePair<string, double>>
    {
        /// <summary>Shared instance, so callers do not have to allocate their own.</summary>
        internal static CWeightCompararer Comparer = new CWeightCompararer();

        #region IComparer<KeyValuePair<string,double>> Members

        /// <summary>
        /// Compares two pairs by their scores in descending order.
        /// </summary>
        /// <param name="x">First keyword/score pair.</param>
        /// <param name="y">Second keyword/score pair.</param>
        /// <returns>
        /// A negative value when <paramref name="x"/> has the higher score, a positive
        /// value when <paramref name="y"/> has the higher score, zero when the scores are equal.
        /// </returns>
        public int Compare(KeyValuePair<string, double> x, KeyValuePair<string, double> y)
        {
            double firstScore = x.Value;
            double secondScore = y.Value;

            // operands are swapped on purpose: this yields a descending order
            return secondScore.CompareTo(firstScore);
        }

        #endregion
    }

    /// <summary>
    /// Console entry point: scores the nouns (and optionally the verbs) found in a text file
    /// and prints the highest-scoring words as the keywords of the text.
    /// </summary>
    class Program
    {
        /// <summary>Number of keywords printed when the command line does not specify one.</summary>
        private const int DefaultMaxKeyWords = 10;

        /// <summary>
        /// Parses the command line, scores the words of the given file and prints the best ones.
        /// </summary>
        /// <param name="args">
        /// args[0]: path of the text file (required);
        /// args[1]: "verbs" to score verbs as well as nouns, any other value for nouns only (optional, default "verbs");
        /// args[2]: maximum number of keywords to print (optional, default 10);
        /// args[3]: name of the file encoding (optional, default utf-8).
        /// </param>
        static void Main(string[] args)
        {
            if (0 == args.Length)
            {
                Console.WriteLine("usage: keywords.exe <fileName> [use verbs:verbs(default),nouns] [number of keywords:10(default)]  [encoding:utf-8(default),ascii,utf-16]");
                Console.WriteLine("examples:\r\n");
                Console.WriteLine(@"keywords.exe c:\text.txt");
                Console.WriteLine(@"keywords.exe c:\text.txt verbs 20");
                Console.WriteLine(@"keywords.exe c:\text.txt nouns 10 ASCII");
                return;
            }

            string szFile = args[0];

            if (!File.Exists(szFile))
            {
                Console.WriteLine("Cannot find file: " + szFile);
                return;
            }

            // a malformed or negative count is reported instead of crashing with an unhandled exception
            int intMaxKewWords = DefaultMaxKeyWords;
            if ((args.Length > 2) && (!int.TryParse(args[2], out intMaxKewWords) || (intMaxKewWords < 0)))
            {
                Console.WriteLine("Invalid number of keywords: " + args[2]);
                return;
            }

            // the usage text documents "verbs" as the default, so an omitted argument enables verbs
            bool blnUseVerbs = (args.Length < 2) || string.Equals("verbs", args[1], StringComparison.OrdinalIgnoreCase);

            // is encoding specified explicitly?
            Encoding encoding = Encoding.UTF8;
            if (args.Length > 3)
            {
                encoding = ResolveEncoding(args[3]);
                if (null == encoding)
                {
                    Console.WriteLine("Unknown encoding: " + args[3]);
                    return;
                }
            }

            NLParser parser = new NLParser();

            // keys are compared case-insensitively, so "King" and "king" share one score
            SortedList<string, double> lstKeyWords = new SortedList<string, double>(StringComparer.InvariantCultureIgnoreCase);

            long lngSyntaxes = 0;
            long lngUtterances = 0;

            // score words in a text
            foreach (Utterance utterance in parser.Text<Utterance>(szFile, encoding))
            {
                lngUtterances++;
                if ((null == utterance.Syntaxes) || (0 == utterance.Syntaxes.Length))
                    continue;

                // the syntaxes of one utterance share a total weight of 1, split equally between them
                double fWeight = 1.0 / utterance.Syntaxes.Length;
                foreach (SyntaxNode syntax in utterance.Syntaxes)
                {
                    lngSyntaxes++;
                    syntax.ForEach(delegate(SyntaxNode child)
                    {
                        // only words in the core roles of a phrase are scored
                        switch (child.NodeRole)
                        {
                            case RKRole.subject: // The *king gave Anne Boleyn his love
                            case RKRole.verb:    // The king *gave Anne Boleyn his love
                            case RKRole.directObject: // The king gave Anne Boleyn his *love
                            case RKRole.indirectObject: // The king gave *Anne *Boleyn his love
                            case RKRole.subjectComplementNoun: // The king is *Henry VIII.
                                foreach (Word word in child.Words)
                                {
                                    AddWordScore(lstKeyWords, word, fWeight, blnUseVerbs);
                                }
                                break;
                        }
                    });
                }
            }

            List<KeyValuePair<string, double>> lstBestKeyWords = SelectBestKeyWords(lstKeyWords, intMaxKewWords);

            // display the list of keywords
            Console.WriteLine("File: " + szFile);
            Console.WriteLine("Allow verbs: " + blnUseVerbs);
            Console.WriteLine("Utterances: {0}, diagrams: {1}", lngUtterances, lngSyntaxes);

            Console.WriteLine("Key words:\r\n");
            Console.ForegroundColor = ConsoleColor.Green;
            try
            {
                foreach (KeyValuePair<string, double> kp in lstBestKeyWords)
                {
                    Console.WriteLine(kp.Key + "  " + kp.Value);
                }
            }
            finally
            {
                // do not leave the user's console green after the program exits
                Console.ResetColor();
            }
        }

        /// <summary>
        /// Maps an encoding name from the command line to an <see cref="Encoding"/>.
        /// </summary>
        /// <param name="szName">Encoding name as typed by the user.</param>
        /// <returns>The matching encoding, or null when the name is not recognized.</returns>
        private static Encoding ResolveEncoding(string szName)
        {
            // earlier usage text advertised the spelling "asci", which is not a registered
            // encoding name; keep accepting it so existing command lines continue to work
            if (string.Equals("asci", szName, StringComparison.OrdinalIgnoreCase))
                return Encoding.ASCII;

            try
            {
                return Encoding.GetEncoding(szName);
            }
            catch (ArgumentException)
            {
                return null;
            }
        }

        /// <summary>
        /// Adds the score earned by one word to the running keyword totals.
        /// Only nouns and, when enabled, non-auxiliary verbs earn a score; every other
        /// part of speech (pronouns included) leaves the totals untouched.
        /// </summary>
        /// <param name="lstKeyWords">Running totals, keyed by word text.</param>
        /// <param name="word">Word to score.</param>
        /// <param name="fWeight">Base weight of one occurrence of a word.</param>
        /// <param name="blnUseVerbs">True when verbs may become keywords.</param>
        private static void AddWordScore(SortedList<string, double> lstKeyWords, Word word, double fWeight, bool blnUseVerbs)
        {
            double fAddWeight = 0;
            switch (word.POS)
            {
                case PartOfSpeech.verb:
                    // auxiliary verbs are not interesting
                    if (blnUseVerbs && !word.HasAny(SyntaxTag.auxilary_verb))
                        fAddWeight = fWeight;
                    break;

                case PartOfSpeech.noun:
                    fAddWeight = fWeight;

                    // proper nouns are better candidates for keywords
                    if (word.HasAny(SyntaxTag.proper))
                        fAddWeight *= 2;
                    break;
            }

            if (0 == fAddWeight)
                return;

            // TryGetValue leaves fTotal at 0 for a word seen for the first time
            double fTotal;
            lstKeyWords.TryGetValue(word.Text, out fTotal);
            lstKeyWords[word.Text] = fTotal + fAddWeight;
        }

        /// <summary>
        /// Selects the words with the highest scores.
        /// </summary>
        /// <param name="lstKeyWords">All scored words.</param>
        /// <param name="intMaxKewWords">Maximum number of words to return; must not be negative.</param>
        /// <returns>At most <paramref name="intMaxKewWords"/> words, ordered by descending score.</returns>
        private static List<KeyValuePair<string, double>> SelectBestKeyWords(SortedList<string, double> lstKeyWords, int intMaxKewWords)
        {
            // the list never holds more than min(limit, available) + 1 items, so size it
            // accordingly instead of trusting a possibly huge user-supplied limit
            int intCapacity = Math.Min(intMaxKewWords, lstKeyWords.Count) + 1;
            List<KeyValuePair<string, double>> lstBestKeyWords = new List<KeyValuePair<string, double>>(intCapacity);

            foreach (KeyValuePair<string, double> kp in lstKeyWords)
            {
                // keep the list sorted by descending score: find the insertion point
                int intIndex = lstBestKeyWords.BinarySearch(kp, CWeightCompararer.Comparer);

                // a negative result is the bitwise complement of the insertion index
                if (intIndex < 0)
                    intIndex = ~intIndex;

                lstBestKeyWords.Insert(intIndex, kp);

                // drop the lowest score once the limit is exceeded
                if (lstBestKeyWords.Count > intMaxKewWords)
                    lstBestKeyWords.RemoveAt(lstBestKeyWords.Count - 1);
            }

            return lstBestKeyWords;
        }
    }
}