CaboCha
/home/taku/proj/cabocha/src/cabocha.h
Go to the documentation of this file.
00001 /* CaboCha -- Yet Another Japanese Dependency Parser
00002    $Id: cabocha.h 50 2009-05-03 08:25:36Z taku-ku $;
00003    Copyright(C) 2001-2008 Taku Kudo <taku@chasen.org>
00004 */
00005 #ifndef CABOCHA_CABOCHA_H_
00006 #define CABOCHA_CABOCHA_H_
00007 
00008 #include <stddef.h>  /* size_t; included first so the system header is not wrapped in the C-linkage block below */
00009 
00010 #ifdef __cplusplus
00011 extern "C" {  /* the C API declared below gets C linkage when compiled as C++ */
00012 #endif
00013 
00014 #ifdef _WIN32  /* Windows builds: choose the DLL decoration applied to every API declaration */
00015 #  ifdef DLL_EXPORT  /* DLL_EXPORT defined: decorate the API as exported */
00016 #    define CABOCHA_DLL_EXTERN    __declspec(dllexport)
00017 #  else
00018 #    ifdef  DLL_IMPORT  /* DLL_IMPORT defined (and DLL_EXPORT not): decorate the API as imported */
00019 #      define CABOCHA_DLL_EXTERN  __declspec(dllimport)
00020 #    endif
00021 #  endif
00022 #endif
00023 
00024 #ifndef CABOCHA_DLL_EXTERN  /* fallback: non-Windows builds, or neither macro above was defined */
00025 #  define CABOCHA_DLL_EXTERN extern
00026 #endif
00027 
00028   enum {  /* character-set identifiers; mirrored by CaboCha::CharsetType further down */
00029     CABOCHA_EUC_JP = 0,
00030     CABOCHA_CP932  = 1,
00031     CABOCHA_UTF8   = 2,
00032     CABOCHA_ASCII  = 3
00033   };
00034 
00035   enum {  /* "posset" identifiers; mirrored by CaboCha::PossetType. NOTE(review): presumably the part-of-speech tag scheme in use — confirm */
00036     CABOCHA_IPA    = 0,
00037     CABOCHA_JUMAN  = 1,
00038     CABOCHA_UNIDIC = 2
00039   };
00040 
00041   enum {  /* output-format identifiers; mirrored by CaboCha::FormatType. NOTE(review): presumably the `format` argument of cabocha_tree_tostr() — confirm */
00042     CABOCHA_FORMAT_TREE         = 0,
00043     CABOCHA_FORMAT_LATTICE      = 1,
00044     CABOCHA_FORMAT_TREE_LATTICE = 2,
00045     CABOCHA_FORMAT_XML          = 3,
00046     CABOCHA_FORMAT_NONE         = 4
00047   };
00048 
00049   enum {  /* input-layer identifiers; mirrored by CaboCha::InputLayerType. NOTE(review): presumably the `input_layer` argument of cabocha_tree_read() — confirm */
00050     CABOCHA_INPUT_RAW_SENTENCE  = 0,
00051     CABOCHA_INPUT_POS           = 1,
00052     CABOCHA_INPUT_CHUNK         = 2,
00053     CABOCHA_INPUT_SELECTION     = 3,
00054     CABOCHA_INPUT_DEP           = 4
00055   };
00056 
00057   enum {  /* output-layer identifiers; mirrored by CaboCha::OutputLayerType. NOTE(review): presumably the `output_layer` argument of cabocha_tree_set_output_layer() — confirm */
00058     CABOCHA_OUTPUT_RAW_SENTENCE = 0,
00059     CABOCHA_OUTPUT_POS          = 1,
00060     CABOCHA_OUTPUT_CHUNK        = 2,
00061     CABOCHA_OUTPUT_SELECTION    = 3,
00062     CABOCHA_OUTPUT_DEP          = 4
00063   };
00064 
00065   enum {  /* CABOCHA_TRAIN_* identifiers; mirrored by CaboCha::ParserType further down */
00066     CABOCHA_TRAIN_NE    = 0,
00067     CABOCHA_TRAIN_CHUNK = 1,
00068     CABOCHA_TRAIN_DEP   = 2   /* last enumerator has no trailing comma: C89 and C++98/03 reject one, and the other enums omit it */
00069   };
00070 
00071   enum {  /* parsing-algorithm identifiers; mirrored by CaboCha::ParsingAlgorithm further down */
00072     CABOCHA_SHIFT_REDUCE = 0,
00073     CABOCHA_TOURNAMENT   = 1    /* last enumerator has no trailing comma: C89 and C++98/03 reject one, and the other enums omit it */
00074   };
00075 
00076   typedef struct cabocha_t cabocha_t;  /* parser handle type; struct cabocha_t is only forward-declared in the visible header */
00077   typedef struct cabocha_tree_t cabocha_tree_t;  /* tree handle type; likewise only forward-declared here */
00078   struct mecab_node_t;  /* forward declaration only. NOTE(review): presumably MeCab's node structure — confirm */
00079 
00080   struct cabocha_chunk_t {  /* one chunk record of a tree; aliased as CaboCha::Chunk for C++ */
00081     int                    link;  /* NOTE(review): presumably the index of the chunk this one depends on — confirm */
00082     unsigned short int     head_pos;
00083     unsigned short int     func_pos;
00084     unsigned short int     token_size;  /* NOTE(review): presumably the number of tokens in this chunk — confirm */
00085     size_t                 token_pos;  /* NOTE(review): presumably the index of this chunk's first token — confirm */
00086     float                  score;
00087     const char             **feature_list;
00088     const char             *additional_info;
00089     unsigned short int     feature_list_size;  /* NOTE(review): presumably the element count of feature_list — confirm */
00090   };
00091 
00092   struct cabocha_token_t {  /* one token record of a tree; aliased as CaboCha::Token for C++ */
00093     const char              *surface;
00094     const char              *normalized_surface;
00095     const char              *feature;
00096     const char             **feature_list;
00097     unsigned short int      feature_list_size;  /* NOTE(review): presumably the element count of feature_list — confirm */
00098     const char              *ne;
00099     const char              *additional_info;
00100     struct cabocha_chunk_t  *chunk;  /* pointer to a chunk record. NOTE(review): presumably the chunk containing this token — confirm */
00101   };
00102 
00103   /* cabocha_t and cabocha_tree_t already receive their typedefs next to the forward
00104      declarations above; repeating a typedef is a constraint violation before C11. */
00105   typedef struct cabocha_chunk_t cabocha_chunk_t;
00106   typedef struct cabocha_token_t cabocha_token_t;
00107   typedef struct mecab_node_t mecab_node_t;
00108 
00109 #ifndef SWIG
00110   CABOCHA_DLL_EXTERN int                    cabocha_do(int argc, char **argv);  /* argc/argv-style entry point. NOTE(review): presumably runs the command-line front end — confirm */
00111 
00112   /* parser: functions that create, use and destroy a cabocha_t handle */
00113   CABOCHA_DLL_EXTERN cabocha_t             *cabocha_new(int argc, char **argv);
00114   CABOCHA_DLL_EXTERN cabocha_t             *cabocha_new2(const char *arg);
00115   CABOCHA_DLL_EXTERN const char            *cabocha_strerror(cabocha_t* cabocha);
00116   CABOCHA_DLL_EXTERN const cabocha_tree_t  *cabocha_parse_tree(cabocha_t *cabocha,
00117                                                                cabocha_tree_t *tree);
00118   CABOCHA_DLL_EXTERN const char            *cabocha_sparse_tostr(cabocha_t* cabocha,
00119                                                                  const char* str);
00120   CABOCHA_DLL_EXTERN const char            *cabocha_sparse_tostr2(cabocha_t* cabocha,
00121                                                                   const char* str, size_t length);
00122   CABOCHA_DLL_EXTERN const char            *cabocha_sparse_tostr3(cabocha_t* cabocha, const char* str, size_t length,
00123                                                                   char *output_str, size_t output_length);  /* variant that also takes a caller-supplied output buffer and its size */
00124   CABOCHA_DLL_EXTERN void                  cabocha_destroy(cabocha_t* cabocha);
00125   CABOCHA_DLL_EXTERN const cabocha_tree_t  *cabocha_sparse_totree(cabocha_t* cabocha, const char* str);
00126   CABOCHA_DLL_EXTERN const cabocha_tree_t  *cabocha_sparse_totree2(cabocha_t* cabocha, const char* str, size_t length);
00127   /* cabocha_parse_tree() is declared once, earlier in this group; the redundant second prototype was dropped. */
00128 
00129   /* tree: functions operating on a cabocha_tree_t handle */
00130   CABOCHA_DLL_EXTERN cabocha_tree_t        *cabocha_tree_new(void);  /* (void) makes this a real prototype in C; an empty list leaves the arguments unchecked */
00131   CABOCHA_DLL_EXTERN void                   cabocha_tree_destroy(cabocha_tree_t* tree);
00132   CABOCHA_DLL_EXTERN int                    cabocha_tree_empty(cabocha_tree_t* tree);
00133   CABOCHA_DLL_EXTERN void                   cabocha_tree_clear(cabocha_tree_t* tree);
00134   CABOCHA_DLL_EXTERN void                   cabocha_tree_clear_chunk(cabocha_tree_t* tree);
00135   CABOCHA_DLL_EXTERN size_t                 cabocha_tree_size(cabocha_tree_t* tree);
00136   CABOCHA_DLL_EXTERN size_t                 cabocha_tree_chunk_size(cabocha_tree_t* tree);
00137   CABOCHA_DLL_EXTERN size_t                 cabocha_tree_token_size(cabocha_tree_t* tree);
00138   CABOCHA_DLL_EXTERN const char            *cabocha_tree_sentence(cabocha_tree_t* tree);
00139   CABOCHA_DLL_EXTERN size_t                 cabocha_tree_sentence_size(cabocha_tree_t* tree);
00140   CABOCHA_DLL_EXTERN void                   cabocha_tree_set_sentence(cabocha_tree_t* tree,
00141                                                                       const char *sentence,
00142                                                                       size_t length);
00143   CABOCHA_DLL_EXTERN int                   cabocha_tree_read(cabocha_tree_t* tree,
00144                                                              const char *input,
00145                                                              size_t length,
00146                                                              int input_layer);  /* NOTE(review): input_layer is presumably one of the CABOCHA_INPUT_* constants — confirm */
00147   CABOCHA_DLL_EXTERN int                   cabocha_tree_read_from_mecab_node(cabocha_tree_t* tree,
00148                                                                              const mecab_node_t *node);
00149 
00150   CABOCHA_DLL_EXTERN const cabocha_token_t *cabocha_tree_token(cabocha_tree_t* tree, size_t i);
00151   CABOCHA_DLL_EXTERN const cabocha_chunk_t *cabocha_tree_chunk(cabocha_tree_t* tree, size_t i);
00152 
00153   CABOCHA_DLL_EXTERN cabocha_token_t       *cabocha_tree_add_token(cabocha_tree_t* tree);
00154   CABOCHA_DLL_EXTERN cabocha_chunk_t       *cabocha_tree_add_chunk(cabocha_tree_t* tree);
00155 
00156   CABOCHA_DLL_EXTERN char                  *cabocha_tree_strdup(cabocha_tree_t* tree, const char *str);  /* NOTE(review): presumably the returned memory is owned by the tree — confirm */
00157   CABOCHA_DLL_EXTERN char                  *cabocha_tree_alloc(cabocha_tree_t* tree, size_t size);
00158 
00159   CABOCHA_DLL_EXTERN const char            *cabocha_tree_tostr(cabocha_tree_t* tree, int format);  /* NOTE(review): format is presumably one of the CABOCHA_FORMAT_* constants — confirm */
00160   CABOCHA_DLL_EXTERN const char            *cabocha_tree_tostr2(cabocha_tree_t* tree, int format,
00161                                                                 char *str, size_t length);
00162 
00163   CABOCHA_DLL_EXTERN void                   cabocha_tree_set_charset(cabocha_tree_t* tree,
00164                                                                      int charset);
00165   CABOCHA_DLL_EXTERN int                    cabocha_tree_charset(cabocha_tree_t* tree);
00166   CABOCHA_DLL_EXTERN void                   cabocha_tree_set_posset(cabocha_tree_t* tree,
00167                                                                     int posset);
00168   CABOCHA_DLL_EXTERN int                    cabocha_tree_posset(cabocha_tree_t* tree);
00169   CABOCHA_DLL_EXTERN void                   cabocha_tree_set_output_layer(cabocha_tree_t* tree,
00170                                                                           int output_layer);
00171   CABOCHA_DLL_EXTERN int                    cabocha_tree_output_layer(cabocha_tree_t* tree);
00172 
00173   CABOCHA_DLL_EXTERN int                    cabocha_learn(int argc, char **argv);  /* argc/argv-style entry point. NOTE(review): presumably the model trainer — confirm */
00174   CABOCHA_DLL_EXTERN int                    cabocha_system_eval(int argc, char **argv);  /* argc/argv-style entry point. NOTE(review): presumably the evaluation tool — confirm */
00175   CABOCHA_DLL_EXTERN int                    cabocha_model_index(int argc, char **argv);  /* argc/argv-style entry point. NOTE(review): presumably the model indexer — confirm */
00176 #endif
00177 
00178 #ifdef __cplusplus
00179 }
00180 #endif
00181 
00182 /* for C++ */
00183 #ifdef __cplusplus
00184 
00185 namespace CaboCha {
00186 
00187 class Tree;  /* forward declaration; the class is defined later in this namespace */
00188 typedef struct cabocha_chunk_t Chunk;  /* C++ alias for the C chunk record */
00189 typedef struct cabocha_token_t Token;  /* C++ alias for the C token record */
00190 
00191 enum CharsetType {  /* typed C++ counterpart of the CABOCHA_* character-set constants; values are identical */
00192   EUC_JP = CABOCHA_EUC_JP,
00193   CP932  = CABOCHA_CP932,
00194   UTF8   = CABOCHA_UTF8,
00195   ASCII  = CABOCHA_ASCII
00196 };
00197 
00198 enum PossetType  {  /* typed C++ counterpart of CABOCHA_IPA / CABOCHA_JUMAN / CABOCHA_UNIDIC; values are identical */
00199   IPA    = CABOCHA_IPA,
00200   JUMAN  = CABOCHA_JUMAN,
00201   UNIDIC = CABOCHA_UNIDIC
00202 };
00203 
00204 enum FormatType {  /* typed C++ counterpart of the CABOCHA_FORMAT_* constants; taken by Tree::toString() */
00205   FORMAT_TREE         = CABOCHA_FORMAT_TREE,
00206   FORMAT_LATTICE      = CABOCHA_FORMAT_LATTICE,
00207   FORMAT_TREE_LATTICE = CABOCHA_FORMAT_TREE_LATTICE,
00208   FORMAT_XML          = CABOCHA_FORMAT_XML,
00209   FORMAT_NONE         = CABOCHA_FORMAT_NONE
00210 };
00211 
00212 enum InputLayerType {  /* typed C++ counterpart of the CABOCHA_INPUT_* constants; taken by Tree::read() */
00213   INPUT_RAW_SENTENCE = CABOCHA_INPUT_RAW_SENTENCE,
00214   INPUT_POS          = CABOCHA_INPUT_POS,
00215   INPUT_CHUNK        = CABOCHA_INPUT_CHUNK,
00216   INPUT_SELECTION    = CABOCHA_INPUT_SELECTION,
00217   INPUT_DEP          = CABOCHA_INPUT_DEP
00218 };
00219 
00220 enum OutputLayerType {  /* typed C++ counterpart of the CABOCHA_OUTPUT_* constants; stored by Tree::set_output_layer() */
00221   OUTPUT_RAW_SENTENCE = CABOCHA_OUTPUT_RAW_SENTENCE,
00222   OUTPUT_POS          = CABOCHA_OUTPUT_POS,
00223   OUTPUT_CHUNK        = CABOCHA_OUTPUT_CHUNK,
00224   OUTPUT_SELECTION    = CABOCHA_OUTPUT_SELECTION,
00225   OUTPUT_DEP          = CABOCHA_OUTPUT_DEP
00226 };
00227 
00228 enum ParserType {  /* typed C++ counterpart of the CABOCHA_TRAIN_* constants; values are identical */
00229   TRAIN_NE    = CABOCHA_TRAIN_NE,
00230   TRAIN_CHUNK = CABOCHA_TRAIN_CHUNK,
00231   TRAIN_DEP   = CABOCHA_TRAIN_DEP
00232 };
00233 
00234 enum ParsingAlgorithm {  /* typed C++ counterpart of CABOCHA_SHIFT_REDUCE / CABOCHA_TOURNAMENT; values are identical */
00235   SHIFT_REDUCE = CABOCHA_SHIFT_REDUCE,
00236   TOURNAMENT = CABOCHA_TOURNAMENT  /* last enumerator has no trailing comma: ill-formed before C++11, and the sibling enums omit it */
00237 };
00238 
00239 class TreeAllocator;  /* forward declaration; Tree only stores and returns a pointer to it */
00240 
00241 class Tree {  /* container for one sentence together with its tokens and chunks */
00242  public:
00243   void set_sentence(const char *sentence);
00244   const char *sentence() const;
00245   size_t sentence_size() const;
00246 
00247 #ifndef SWIG  /* members inside the SWIG guards are compiled out when SWIG is defined */
00248   void set_sentence(const char *sentence, size_t length);  /* overload taking an explicit length */
00249 #endif
00250 
00251   const Chunk *chunk(size_t i) const;  /* read-only access to chunk i */
00252   const Token *token(size_t i) const;  /* read-only access to token i */
00253 
00254 #ifndef SWIG
00255   Chunk *mutable_chunk(size_t i);  /* writable access to chunk i */
00256   Token *mutable_token(size_t i);  /* writable access to token i */
00257 
00258   Token *add_token();
00259   Chunk *add_chunk();
00260 
00261   char *strdup(const char *str);  /* NOTE(review): presumably allocates from this tree's allocator, not malloc — confirm before freeing */
00262   char *alloc(size_t size);
00263   char **alloc_char_array(size_t size);
00264 
00265   TreeAllocator *allocator() const;
00266 #endif
00267 
00268   bool   read(const char *input,
00269               InputLayerType input_layer);
00270 
00271 #ifndef SWIG
00272   bool   read(const char *input, size_t length,
00273               InputLayerType input_layer);  /* overload taking an explicit input length */
00274   bool   read(const mecab_node_t *node);  /* overload reading from a mecab_node_t instead of text */
00275 #endif
00276 
00277   bool   empty() const;
00278   void   clear();
00279   void   clear_chunk();
00280 
00281   size_t chunk_size() const;
00282   size_t token_size() const;
00283   size_t size() const;
00284 
00285   const char *toString(FormatType output_format);  /* NOTE(review): non-const, while the buffer-taking overload below is const — confirm this is intentional */
00286 
00287 #ifndef SWIG
00288   const char *toString(FormatType output_format,
00289                        char *output, size_t length) const;  /* overload taking a caller-supplied buffer and its size */
00290 #endif
00291 
00292   CharsetType charset() const { return charset_; }
00293   void set_charset(CharsetType charset) { charset_ = charset; }
00294   PossetType posset() const { return posset_; }
00295   void set_posset(PossetType posset) { posset_ = posset; }
00296   OutputLayerType output_layer() const { return output_layer_; }
00297   void set_output_layer(OutputLayerType output_layer) { output_layer_ = output_layer; }
00298 
00299   const char *what();  /* NOTE(review): presumably returns the last error message — confirm */
00300 
00301   explicit Tree();
00302   virtual ~Tree();
00303 
00304  private:
00305   TreeAllocator              *tree_allocator_;  /* NOTE(review): raw pointer with no copy constructor/assignment declared here — confirm ownership and copy semantics */
00306   CharsetType                 charset_;  /* backing field for charset()/set_charset() */
00307   PossetType                  posset_;  /* backing field for posset()/set_posset() */
00308   OutputLayerType             output_layer_;  /* backing field for output_layer()/set_output_layer() */
00309 };
00310 
00311 class Parser {  /* abstract parser interface: every parse method is pure virtual */
00312  public:
00313   virtual const Tree *parse(const char *input)                          = 0;
00314   virtual const char *parseToString(const char *input)                  = 0;
00315   virtual const Tree *parse(Tree *tree) const                           = 0;  /* the only const parse overload; takes an existing Tree */
00316 
00317 #ifndef SWIG  /* explicit-length and buffer overloads are compiled out when SWIG is defined */
00318   virtual const Tree *parse(const char *input, size_t length)           = 0;
00319   virtual const char *parseToString(const char *input, size_t length)   = 0;
00320   virtual const char *parseToString(const char *input, size_t length,
00321                                     char       *output, size_t output_length) = 0;  /* overload taking a caller-supplied output buffer and its size */
00322 #endif
00323 
00324   virtual const char *what() = 0;  /* NOTE(review): presumably returns the last error message — confirm */
00325   static const char *version();
00326 
00327   virtual ~Parser() {}  /* virtual so implementations can be deleted through a Parser pointer */
00328 
00329 #ifndef SWIG
00330   static Parser *create(int argc, char **argv);  /* factory overloads mirroring the createParser() free functions */
00331   static Parser *create(const char *arg);
00332 #endif
00333 };
00334 
00335 CABOCHA_DLL_EXTERN Parser *createParser(int argc, char **argv);  /* argc/argv overload. NOTE(review): caller presumably owns the returned Parser — confirm */
00336 CABOCHA_DLL_EXTERN Parser *createParser(const char *arg);  /* single-string overload */
00337 CABOCHA_DLL_EXTERN const char *getParserError();  /* NOTE(review): presumably describes the most recent createParser() failure — confirm */
00338 CABOCHA_DLL_EXTERN const char *getLastError();  /* NOTE(review): relationship to getParserError() is not visible here — confirm */
00339 }
00340 #endif
00341 #endif