受欢迎的博客标签

英文关键字提取工具

Published

开源工具

RAKE(https://github.com/zelandiya/RAKE-tutorial)
KEA(http://www.nzdl.org/Kea/) 监督式机器学习,使用训练数据和受控词表。
maui indexer(https://code.google.com/archive/p/maui-indexer/) 在kea的基础上进行拓展,增加新的特征项目,拓展了维基百科作为受控词表。
carrot2(http://project.carrot2.org/) 无监督方法,支持多种输入,输出格式和参数设置。
mallet topic modeling module(http://mallet.cs.umass.edu/topics.php)
Stanford topic modeling tool (http://nlp.stanford.edu/software/tmt/tmt-0.3/)
Mahout clustering algorithms(http://mahout.apache.org/)

商业API


Alchemy API(http://www.alchemyapi.com/api/keyword-extraction)
zemanta API(http://developer.zemanta.com/)
yahoo term extraction api(https://developer.yahoo.com/search/content/V1/termExtraction.html)

开源工具中,目前主要使用了RAKE、KEA和maui indexer。
关于RAKE(Rapid Automatic Keyword Extraction)算法,可以参考这篇翻译版的文章(http://python.jobbole.com/82230/)。这篇文章的原作者是Alyona Medelyan,RAKE的更新版本就是她完成的,maui indexer也是她的杰作;她的GitHub(https://github.com/zelandiya)上有很多关键字提取的项目。

 

C# 英文关键字提取工具

C# 英文关键字提取工具实现,使用 TF-IDF 算法从英文文本中提取最重要的关键词

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using System.IO;

namespace KeywordExtractor
{
    /// <summary>
    /// Frequency-based keyword extraction for English text.
    /// A single text is ranked by term frequency (TF); a collection of documents is ranked by TF-IDF,
    /// where the inverse document frequency is computed over the supplied collection.
    /// </summary>
    public static class EnglishKeywordExtractor
    {
        // English stop words that are never reported as keywords.
        // Entries of one or two letters are also removed by the minimum-length filter; they are kept
        // here so the list stays meaningful if that filter is ever relaxed.
        private static readonly HashSet<string> StopWords = new HashSet<string>
        {
            // Articles, conjunctions, prepositions and adverbs
            "a", "an", "the", "and", "or", "but", "if", "then", "else", "when", "at", "from", "into", "during",
            "to", "of", "for", "in", "on", "by", "about", "with", "against", "between", "through", "before",
            "after", "above", "below", "up", "down", "out", "off", "over", "under",
            "again", "further", "once", "here", "there", "where", "why", "how", "all", "any",
            "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own",
            "same", "so", "than", "too", "very", "just", "now",
            // Fragments left behind when contractions such as "it's" / "don't" are split on the apostrophe
            "s", "t", "don",
            // Auxiliary and modal verbs
            "am", "is", "are", "was", "were", "be", "been", "being",
            "have", "has", "had", "having", "do", "does", "did", "doing",
            "can", "could", "will", "would", "shall", "should", "may", "might", "must", "ought", "need", "dare",
            // Pronouns and determiners
            "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
            "you", "your", "yours", "yourself", "yourselves",
            "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself",
            "they", "them", "their", "theirs", "themselves",
            "what", "which", "who", "whom", "this", "that", "these", "those"
        };

        // A word is a maximal run of lowercase ASCII letters. Matching words directly (instead of
        // blanking out non-letters and then splitting on a fixed list of separators) means every kind of
        // whitespace - form feed, vertical tab, no-break space, ... - acts as a word boundary.
        private static readonly Regex WordPattern = new Regex("[a-z]+", RegexOptions.Compiled);

        // Words shorter than this are discarded as noise.
        private const int MinWordLength = 3;

        /// <summary>
        /// Extracts keywords from a single text, ranked by term frequency.
        /// </summary>
        /// <param name="text">Input text.</param>
        /// <param name="topN">Maximum number of keywords to return.</param>
        /// <returns>
        /// Up to <paramref name="topN"/> keywords, most frequent first; an empty list when
        /// <paramref name="text"/> is null, empty or whitespace.
        /// </returns>
        public static List<string> ExtractKeywords(string text, int topN = 10)
        {
            if (string.IsNullOrWhiteSpace(text))
            {
                return new List<string>();
            }

            var words = TokenizeAndNormalize(text);
            var wordFrequencies = CalculateWordFrequencies(words);
            var scores = CalculateTfIdf(wordFrequencies, words.Count);

            return SelectTopKeywords(scores, topN);
        }

        /// <summary>
        /// Extracts keywords from every document of a collection, ranked by TF-IDF.
        /// </summary>
        /// <param name="documents">Document collection.</param>
        /// <param name="topN">Maximum number of keywords to return per document.</param>
        /// <returns>
        /// One keyword list per input document, in input order. Documents that are null, empty or
        /// whitespace produce an empty list, so the result is always index-aligned with the input.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="documents"/> is null.</exception>
        public static List<List<string>> ExtractKeywordsFromDocuments(IEnumerable<string> documents, int topN = 10)
        {
            if (documents == null)
            {
                throw new ArgumentNullException(nameof(documents));
            }

            // Tokenize every document in a single pass over the input. Blank documents keep their slot
            // (as an empty word list) but are not counted towards the corpus size used for IDF.
            var allDocumentsWords = new List<List<string>>();
            int corpusSize = 0;
            foreach (var document in documents)
            {
                if (string.IsNullOrWhiteSpace(document))
                {
                    allDocumentsWords.Add(new List<string>());
                }
                else
                {
                    allDocumentsWords.Add(TokenizeAndNormalize(document));
                    corpusSize++;
                }
            }

            // Per-document term counts.
            var documentWordFrequencies = allDocumentsWords
                .Select(CalculateWordFrequencies)
                .ToList();

            // Document frequency: the number of documents that contain each word.
            var documentFrequency = new Dictionary<string, int>();
            foreach (var frequencies in documentWordFrequencies)
            {
                foreach (var word in frequencies.Keys)
                {
                    int seen;
                    documentFrequency.TryGetValue(word, out seen);
                    documentFrequency[word] = seen + 1;
                }
            }

            // Score and rank each document.
            var results = new List<List<string>>(allDocumentsWords.Count);
            for (int i = 0; i < allDocumentsWords.Count; i++)
            {
                int totalWords = allDocumentsWords[i].Count;
                var tfidfScores = new Dictionary<string, double>();

                foreach (var entry in documentWordFrequencies[i])
                {
                    double tf = (double)entry.Value / totalWords;

                    // Smoothed IDF: log((1 + N) / (1 + df)) + 1. Unlike log(N / (df + 1)) it is strictly
                    // positive, so a word's score can never drop to zero or turn negative (which would
                    // invert the ranking, e.g. for a one-document collection).
                    double idf = Math.Log((1.0 + corpusSize) / (1.0 + documentFrequency[entry.Key])) + 1.0;

                    tfidfScores[entry.Key] = tf * idf;
                }

                results.Add(SelectTopKeywords(tfidfScores, topN));
            }

            return results;
        }

        /// <summary>
        /// Lower-cases the text, splits it into words made of ASCII letters, and drops stop words and
        /// words shorter than <see cref="MinWordLength"/>.
        /// </summary>
        private static List<string> TokenizeAndNormalize(string text)
        {
            return WordPattern.Matches(text.ToLowerInvariant())
                .Cast<Match>()
                .Select(match => match.Value)
                .Where(word => word.Length >= MinWordLength && !StopWords.Contains(word))
                .ToList();
        }

        /// <summary>
        /// Counts how many times each word occurs in <paramref name="words"/>.
        /// </summary>
        private static Dictionary<string, int> CalculateWordFrequencies(List<string> words)
        {
            var frequencies = new Dictionary<string, int>();
            foreach (var word in words)
            {
                int count;
                frequencies.TryGetValue(word, out count);
                frequencies[word] = count + 1;
            }
            return frequencies;
        }

        /// <summary>
        /// Scores the words of a single document.
        /// With only one document there is no corpus to derive IDF from, so the score is plain
        /// term frequency (occurrences divided by <paramref name="totalWords"/>).
        /// </summary>
        private static Dictionary<string, double> CalculateTfIdf(Dictionary<string, int> wordFrequencies, int totalWords)
        {
            return wordFrequencies.ToDictionary(
                kv => kv.Key,
                kv => (double)kv.Value / totalWords
            );
        }

        /// <summary>
        /// Returns the <paramref name="topN"/> highest-scoring words, best first.
        /// </summary>
        private static List<string> SelectTopKeywords(Dictionary<string, double> scores, int topN)
        {
            return scores
                .OrderByDescending(kv => kv.Value)
                .Take(topN)
                .Select(kv => kv.Key)
                .ToList();
        }

        /// <summary>
        /// Reads a text file and extracts keywords from its content.
        /// </summary>
        /// <param name="filePath">Path of the file to read.</param>
        /// <param name="topN">Maximum number of keywords to return.</param>
        /// <returns>Up to <paramref name="topN"/> keywords, most frequent first.</returns>
        /// <exception cref="FileNotFoundException">The file does not exist.</exception>
        public static List<string> ExtractKeywordsFromFile(string filePath, int topN = 10)
        {
            if (!File.Exists(filePath))
            {
                throw new FileNotFoundException($"File not found: {filePath}", filePath);
            }

            string text = File.ReadAllText(filePath);
            return ExtractKeywords(text, topN);
        }
    }

    /// <summary>
    /// Console demo for <c>EnglishKeywordExtractor</c>: runs it on an inline passage,
    /// on a small document collection, and on a text file written to the working directory.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            // Passage shared by the in-memory demo and the file-based demo.
            string sampleText = @"Artificial intelligence (AI) is intelligence demonstrated by machines, 
                unlike the natural intelligence displayed by humans and animals. Leading AI textbooks define 
                the field as the study of 'intelligent agents': any system that perceives its environment and 
                takes actions that maximize its chance of achieving its goals. Colloquially, the term 'artificial 
                intelligence' is often used to describe machines that mimic 'cognitive' functions that humans 
                associate with the human mind, such as 'learning' and 'problem solving'.";

            // 1) Single text.
            Console.WriteLine("Extracting keywords from sample text:");
            List<string> sampleKeywords = EnglishKeywordExtractor.ExtractKeywords(sampleText);
            Console.WriteLine(string.Join(", ", sampleKeywords));
            Console.WriteLine();

            // 2) Several documents scored against each other.
            var corpus = new List<string>();
            corpus.Add("Machine learning is a subset of artificial intelligence that focuses on building systems that learn from data.");
            corpus.Add("Deep learning is a specialized form of machine learning that uses neural networks with many layers.");
            corpus.Add("Natural language processing enables computers to understand, interpret, and generate human language.");

            Console.WriteLine("Extracting keywords from multiple documents:");
            int documentNumber = 0;
            foreach (List<string> documentKeywords in EnglishKeywordExtractor.ExtractKeywordsFromDocuments(corpus))
            {
                documentNumber++;
                Console.WriteLine($"Document {documentNumber}: {string.Join(", ", documentKeywords)}");
            }
            Console.WriteLine();

            // 3) Round-trip through a file; any failure is reported on the console rather than crashing the demo.
            try
            {
                string filePath = "sample.txt";
                File.WriteAllText(filePath, sampleText);
                Console.WriteLine($"Extracting keywords from file '{filePath}':");
                List<string> keywordsFromFile = EnglishKeywordExtractor.ExtractKeywordsFromFile(filePath);
                Console.WriteLine(string.Join(", ", keywordsFromFile));
            }
            catch (Exception ex)
            {
                Console.WriteLine($"Error: {ex.Message}");
            }
        }
    }
}