RSS
热门关键字:  数据挖掘  人工智能  数据仓库  搜索引擎  数据挖掘导论

Nutch中Analysis包下的NutchAnalysis.jj解读

来源: 作者:unknown 时间:2004-12-06 点击:

学习完javacc语法之后,再读NutchAnalysis.jj就轻松多了。原文解读如下,省略了大片的注释!nutch的分词,主要由此文件完成。

原文如下:

/** JavaCC code for the Nutch lexical analyzer. */
/** Modify this file to make Nutch support Chinese word segmentation. */

options {
  STATIC = false;                  // generated parser members are instance members
  USER_CHAR_STREAM = true;         // input comes from a user-supplied char stream
  OPTIMIZE_TOKEN_MANAGER = true;
  UNICODE_INPUT = true;            // needed for the Unicode ranges in the TOKEN section
//DEBUG_TOKEN_MANAGER = true;
}

PARSER_BEGIN(NutchAnalysis)
// Import declarations placed between PARSER_BEGIN and PARSER_END are copied
// into the generated Parser and TokenManager files.
package org.apache.nutch.analysis;

import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.QueryFilters;
import org.apache.nutch.searcher.Query.Clause;

import org.apache.lucene.analysis.StopFilter;

import java.io.*;
import java.util.*;

/** The JavaCC-generated Nutch lexical analyzer and query parser. */
public class NutchAnalysis {

  // This stop-word list can be extended.
  private static final String[] STOP_WORDS = {
    "a", "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "s", "such",
    "t", "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with"
  };

  private static final Set STOP_SET = StopFilter.makeStopSet(STOP_WORDS);

  /** The original query text; used to extract raw substrings for raw fields. */
  private String queryString;

  /** True iff word is a stop word.  Stop words are only removed from queries.
   * Every word is indexed.  */
  public static boolean isStopWord(String word) {
    return STOP_SET.contains(word);
  }

  /**
   * Parses a query string.
   *
   * @param queryString the query text to parse
   * @return the Query produced by the parse() production
   * @throws IOException declared by the original signature
   */
  public static Query parseQuery(String queryString) throws IOException {
    NutchAnalysis parser =
      new NutchAnalysis(new FastCharStream(new StringReader(queryString)));
    // Keep the original text so phrase()/compound() can take substrings of it.
    parser.queryString = queryString;
    return parser.parse();
  }

  /** For debugging: reads queries from standard input until end of input. */
  public static void main(String[] args) throws Exception {
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    while (true) {
      System.out.print("Query: ");
      String line = in.readLine();
      if (line == null) {
        // readLine() returns null at end of input; stop instead of passing
        // null on to parseQuery().
        break;
      }
      System.out.println(parseQuery(line));
    }
  }

}

PARSER_END(NutchAnalysis)

// Declarations in this section are added to the generated token manager class.
TOKEN_MGR_DECLS : {

  /** Constructs a token manager for the provided Reader. */
  public NutchAnalysisTokenManager(Reader reader) {
    // Wrap the Reader in the char stream type the other constructor accepts.
    this(new FastCharStream(reader));
  }

}

TOKEN : {       // token regular expressions

  // basic word -- lowercase it
  // A basic word contains no whitespace; the { } block that follows a token
  // definition is the action applied to the matched token.
  <WORD: ((<LETTER>|<DIGIT>|<WORD_PUNCT>)+ | <IRREGULAR_WORD>)>
  { matchedToken.image = matchedToken.image.toLowerCase(); }

  // special handling for acronyms: U.S.A., I.B.M., etc: dots are removed
| <ACRONYM: <LETTER> "." (<LETTER> ".")+ >
    {                                             // remove dots
      for (int i = 0; i < image.length(); i++) {
        if (image.charAt(i) == '.')
          image.deleteCharAt(i--);                // step back so the next char is not skipped
      }
      matchedToken.image = image.toString().toLowerCase();
    }

  // chinese, japanese and korean characters
  // SIGRAM = single gram: each CJK character becomes its own token
| <SIGRAM: <CJK> >

   // irregular words
| <#IRREGULAR_WORD: (<C_PLUS_PLUS>|<C_SHARP>)>
| <#C_PLUS_PLUS: ("C"|"c") "++" >
| <#C_SHARP: ("C"|"c") "#" >

  // query syntax characters
| <PLUS: "+" >
| <MINUS: "-" >
  // double quote
| <QUOTE: "\"" >
| <COLON: ":" >
| <SLASH: "/" >
| <DOT: "." >
| <ATSIGN: "@" >
| <APOSTROPHE: "'" >   // apostrophe
| <WHITE: ~[] >                                   // treat unrecognized chars
                                                  // as whitespace
// primitive, non-token patterns

| <#WORD_PUNCT: ("_"|"&")>                        // allowed anywhere in words

| < #LETTER:       // alphabets
    [
        "\u0041"-"\u005a",
        "\u0061"-"\u007a",
        "\u00c0"-"\u00d6",
        "\u00d8"-"\u00f6",
        "\u00f8"-"\u00ff",
        "\u0100"-"\u1fff"
    ]
    >

|  <#CJK:                                        // non-alphabets
      [
       "\u3040"-"\u318f",
       "\u3300"-"\u337f",
       "\u3400"-"\u3d2d",
       "\u4e00"-"\u9fff",
       "\uf900"-"\ufaff"
      ]
    >

| < #DIGIT:       // unicode digits
      [
       "\u0030"-"\u0039",
       "\u0660"-"\u0669",
       "\u06f0"-"\u06f9",
       "\u0966"-"\u096f",
       "\u09e6"-"\u09ef",
       "\u0a66"-"\u0a6f",
       "\u0ae6"-"\u0aef",
       "\u0b66"-"\u0b6f",
       "\u0be7"-"\u0bef",
       "\u0c66"-"\u0c6f",
       "\u0ce6"-"\u0cef",
       "\u0d66"-"\u0d6f",
       "\u0e50"-"\u0e59",
       "\u0ed0"-"\u0ed9",
       "\u1040"-"\u1049"
      ]
  >

}


/** Parse a query: a sequence of clauses separated by noise, each clause being
 * an optional +/- operator, an optional "field:" prefix, and a quoted phrase
 * or compound term.  Returns the Query the clauses were added to. */
Query parse() :
{
  // local declarations
  Query query = new Query();
  ArrayList terms;
  Token token;
  String field;
  boolean stop;
  boolean prohibited;
}
{
  nonOpOrTerm()                                   // skip noise
  (
    // per-clause defaults; the field defaults to Clause.DEFAULT_FIELD
    { stop=true; prohibited=false; field = Clause.DEFAULT_FIELD; }

                                                  // optional + or - operator
    // Either operator disables stop-word removal for this clause; "-" also
    // marks the clause as prohibited (documents containing it are excluded).
    ( <PLUS> {stop=false;} | (<MINUS> { stop=false;prohibited=true; } ))?

                                                  // optional field spec.
    ( LOOKAHEAD(<WORD><COLON>(phrase(field)|compound(field)))
      token=<WORD> <COLON> { field = token.image; } )?

    ( terms=phrase(field) {stop=false;} |         // quoted terms or
      terms=compound(field))                      // single or compound term

    nonOpOrTerm()                                 // skip noise

    {
      String[] array = (String[])terms.toArray(new String[terms.size()]);

      // NOTE(review): "field == Clause.DEFAULT_FIELD" is a reference
      // comparison; it holds when no field spec replaced the default above.
      // Presumably intentional -- confirm before changing it to equals().
      if (stop
          && field == Clause.DEFAULT_FIELD
          && terms.size()==1
          && isStopWord(array[0])) {
        // ignore stop words only when single, unadorned terms in default field
      } else {
        if (prohibited)
          query.addProhibitedPhrase(array, field);
        else
          query.addRequiredPhrase(array, field);
      }
    }
  )*

  { return query; }

}

/** Parse an explicitly quoted phrase query.  Note that this may return a single
 * term, a trivial phrase.  The phrase ends at the closing quote or at the end
 * of input.  For a raw field the result is instead a single element: the
 * substring of the original query between the recorded start and end columns. */
ArrayList phrase(String field) :
{
  int start;
  int end;
  ArrayList result = new ArrayList();
  String term;
}
{
  <QUOTE>

  { start = token.endColumn; }                    // end column of the opening quote

  (nonTerm())*                                    // skip noise
  ( term = term() { result.add(term); }           // parse a term
    (nonTerm())*)*                                // skip noise

  { end = token.endColumn; }                      // end column of the last token consumed

  (<QUOTE>|<EOF>)

  {
    if (QueryFilters.isRawField(field)) {
      result.clear();
      // return the untokenized substring of the query instead of the terms
      result.add(queryString.substring(start, end));
    }
    return result;
  }

}

/** Parse a compound term that is interpreted as an implicit phrase query.
 * Compounds are a sequence of terms separated by infix characters.  Note that
 * this may return a single term, a trivial compound.  For a raw field the
 * result is instead a single element: the substring of the original query
 * from the recorded start column to the end column of the last token. */
ArrayList compound(String field) :
{
  int start;
  ArrayList result = new ArrayList();
  String term;
}
{
  { start = token.endColumn; }                    // end column of the token before the compound

  term = term() { result.add(term); }
  // Continue only while one or more infix characters are followed by another
  // term; trailing infix characters are left for the caller to skip.
  ( LOOKAHEAD( (infix())+ term() )
    (infix())+
    term = term() { result.add(term); })*

  {
    if (QueryFilters.isRawField(field)) {
      result.clear();
      result.add(queryString.substring(start, token.endColumn));
    }
    return result;
  }

}

/** Parse a single term: a WORD, ACRONYM or SIGRAM token.  Returns the token's
 * image. */
String term() :
{
  Token token;
}
{

  ( token=<WORD> | token=<ACRONYM> | token=<SIGRAM>)

  { return token.image; }
}


/** Parse anything but a term or a quote. */
// Matches a single noise element: either a <WHITE> token or one infix character.
void nonTerm() :
{}
{
  <WHITE> | infix()
}
/** Parse a single non-term element, or the end of input. */
void nonTermOrEOF() :
{}
{
  nonTerm() | <EOF>
}

/** Parse anything but a term or an operator (plus or minus or quote).*/
// The operators are plus, minus and quote.  A <PLUS> or <MINUS> is nevertheless
// consumed here as noise when it is immediately followed by nonTermOrEOF(),
// i.e. by whitespace, an infix character or the end of input, rather than by a
// term or a quote.  LOOKAHEAD(2) lets the parser inspect the token after the
// sign before committing to this alternative.
void nonOpOrTerm() :
{}
{
  (LOOKAHEAD(2) (<WHITE> | nonOpInfix() | ((<PLUS>|<MINUS>) nonTermOrEOF())))*
}

/** Characters which can be used to form compound terms. */
// An infix character is plus, minus, or any of the characters matched by
// nonOpInfix(); the quote is not an infix character.
void infix() :
{}
{
  <PLUS> | <MINUS> | nonOpInfix()
}

/** Parse infix characters except plus and minus. */
// These are the infix characters that are not query operators:
// colon, slash, dot, at-sign and apostrophe.
void nonOpInfix() :
{}
{
  <COLON>|<SLASH>|<DOT>|<ATSIGN>|<APOSTROPHE>
}


最新评论共有 0 位网友发表了评论
发表评论
评论内容:不能超过250字,需审核,请自觉遵守互联网相关政策法规。
匿名?