CaboCha
|
/*
  CaboCha -- Yet Another Japanese Dependency Parser
  $Id: cabocha.h 50 2009-05-03 08:25:36Z taku-ku $;
  Copyright(C) 2001-2008 Taku Kudo <taku@chasen.org>
*/
#ifndef CABOCHA_CABOCHA_H_
#define CABOCHA_CABOCHA_H_

#ifdef __cplusplus
extern "C" {
#endif

#include <stddef.h>

/*
 * CABOCHA_DLL_EXTERN is the linkage prefix placed on every exported
 * function.  On Windows it becomes __declspec(dllexport) or
 * __declspec(dllimport) when DLL_EXPORT or DLL_IMPORT is defined; in every
 * other case it falls back to plain `extern`.
 */
#ifdef _WIN32
#  ifdef DLL_EXPORT
#    define CABOCHA_DLL_EXTERN __declspec(dllexport)
#  else
#    ifdef DLL_IMPORT
#      define CABOCHA_DLL_EXTERN __declspec(dllimport)
#    endif
#  endif
#endif

#ifndef CABOCHA_DLL_EXTERN
#  define CABOCHA_DLL_EXTERN extern
#endif

/* Character set identifiers (mirrored by CaboCha::CharsetType). */
enum {
  CABOCHA_EUC_JP = 0,
  CABOCHA_CP932  = 1,
  CABOCHA_UTF8   = 2,
  CABOCHA_ASCII  = 3
};

/* Part-of-speech set identifiers (mirrored by CaboCha::PossetType). */
enum {
  CABOCHA_IPA    = 0,
  CABOCHA_JUMAN  = 1,
  CABOCHA_UNIDIC = 2
};

/* Output format identifiers (mirrored by CaboCha::FormatType). */
enum {
  CABOCHA_FORMAT_TREE         = 0,
  CABOCHA_FORMAT_LATTICE      = 1,
  CABOCHA_FORMAT_TREE_LATTICE = 2,
  CABOCHA_FORMAT_XML          = 3,
  CABOCHA_FORMAT_NONE         = 4
};

/* Input layer identifiers (mirrored by CaboCha::InputLayerType). */
enum {
  CABOCHA_INPUT_RAW_SENTENCE = 0,
  CABOCHA_INPUT_POS          = 1,
  CABOCHA_INPUT_CHUNK        = 2,
  CABOCHA_INPUT_SELECTION    = 3,
  CABOCHA_INPUT_DEP          = 4
};

/* Output layer identifiers (mirrored by CaboCha::OutputLayerType). */
enum {
  CABOCHA_OUTPUT_RAW_SENTENCE = 0,
  CABOCHA_OUTPUT_POS          = 1,
  CABOCHA_OUTPUT_CHUNK        = 2,
  CABOCHA_OUTPUT_SELECTION    = 3,
  CABOCHA_OUTPUT_DEP          = 4
};

/*
 * Training/parser kind identifiers (mirrored by CaboCha::ParserType).
 * No trailing comma after the last enumerator: C89 and C++03 reject it.
 */
enum {
  CABOCHA_TRAIN_NE    = 0,
  CABOCHA_TRAIN_CHUNK = 1,
  CABOCHA_TRAIN_DEP   = 2
};

/* Parsing algorithm identifiers (mirrored by CaboCha::ParsingAlgorithm). */
enum {
  CABOCHA_SHIFT_REDUCE = 0,
  CABOCHA_TOURNAMENT   = 1
};

/*
 * Opaque handle types.  Each typedef appears exactly once in this header:
 * repeating a typedef is a constraint violation before C11.
 */
typedef struct cabocha_t      cabocha_t;
typedef struct cabocha_tree_t cabocha_tree_t;
struct mecab_node_t;

/*
 * A chunk record as exposed by cabocha_tree_chunk().
 * NOTE(review): the meaning of the individual fields is not visible from
 * this header; the names suggest `link` refers to another chunk and
 * `token_pos`/`token_size` delimit the chunk's tokens -- confirm against
 * the implementation before relying on it.
 */
struct cabocha_chunk_t {
  int                  link;
  unsigned short int   head_pos;
  unsigned short int   func_pos;
  unsigned short int   token_size;
  size_t               token_pos;
  float                score;
  const char         **feature_list;
  const char          *additional_info;
  unsigned short int   feature_list_size;
};

/*
 * A token record as exposed by cabocha_tree_token().
 * `chunk` points at a struct cabocha_chunk_t.
 * NOTE(review): when `chunk` is set or NULL is not visible here -- confirm.
 */
struct cabocha_token_t {
  const char              *surface;
  const char              *normalized_surface;
  const char              *feature;
  const char             **feature_list;
  unsigned short int       feature_list_size;
  const char              *ne;
  const char              *additional_info;
  struct cabocha_chunk_t  *chunk;
};

typedef struct cabocha_chunk_t cabocha_chunk_t;
typedef struct cabocha_token_t cabocha_token_t;
typedef struct mecab_node_t    mecab_node_t;

#ifndef SWIG
/* Entry point taking an argc/argv pair. */
CABOCHA_DLL_EXTERN int cabocha_do(int argc, char **argv);

/* parser */
CABOCHA_DLL_EXTERN cabocha_t *cabocha_new(int argc, char **argv);
CABOCHA_DLL_EXTERN cabocha_t *cabocha_new2(const char *arg);
CABOCHA_DLL_EXTERN const char *cabocha_strerror(cabocha_t *cabocha);
CABOCHA_DLL_EXTERN const cabocha_tree_t *cabocha_parse_tree(cabocha_t *cabocha,
                                                            cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN const char *cabocha_sparse_tostr(cabocha_t *cabocha,
                                                    const char *str);
CABOCHA_DLL_EXTERN const char *cabocha_sparse_tostr2(cabocha_t *cabocha,
                                                     const char *str,
                                                     size_t length);
/* Variant that additionally takes an output buffer and its length. */
CABOCHA_DLL_EXTERN const char *cabocha_sparse_tostr3(cabocha_t *cabocha,
                                                     const char *str,
                                                     size_t length,
                                                     char *output_str,
                                                     size_t output_length);
CABOCHA_DLL_EXTERN void cabocha_destroy(cabocha_t *cabocha);
CABOCHA_DLL_EXTERN const cabocha_tree_t *cabocha_sparse_totree(cabocha_t *cabocha,
                                                               const char *str);
CABOCHA_DLL_EXTERN const cabocha_tree_t *cabocha_sparse_totree2(cabocha_t *cabocha,
                                                                const char *str,
                                                                size_t length);

/* tree */
/* `(void)` makes this a real prototype in C; `()` would leave the
   parameter list unspecified. */
CABOCHA_DLL_EXTERN cabocha_tree_t *cabocha_tree_new(void);
CABOCHA_DLL_EXTERN void cabocha_tree_destroy(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN int cabocha_tree_empty(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN void cabocha_tree_clear(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN void cabocha_tree_clear_chunk(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN size_t cabocha_tree_size(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN size_t cabocha_tree_chunk_size(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN size_t cabocha_tree_token_size(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN const char *cabocha_tree_sentence(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN size_t cabocha_tree_sentence_size(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN void cabocha_tree_set_sentence(cabocha_tree_t *tree,
                                                  const char *sentence,
                                                  size_t length);
/* `input_layer` is an int; presumably one of the CABOCHA_INPUT_*
   constants -- confirm against the implementation. */
CABOCHA_DLL_EXTERN int cabocha_tree_read(cabocha_tree_t *tree,
                                         const char *input,
                                         size_t length,
                                         int input_layer);
CABOCHA_DLL_EXTERN int cabocha_tree_read_from_mecab_node(cabocha_tree_t *tree,
                                                         const mecab_node_t *node);

CABOCHA_DLL_EXTERN const cabocha_token_t *cabocha_tree_token(cabocha_tree_t *tree,
                                                             size_t i);
CABOCHA_DLL_EXTERN const cabocha_chunk_t *cabocha_tree_chunk(cabocha_tree_t *tree,
                                                             size_t i);

CABOCHA_DLL_EXTERN cabocha_token_t *cabocha_tree_add_token(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN cabocha_chunk_t *cabocha_tree_add_chunk(cabocha_tree_t *tree);

CABOCHA_DLL_EXTERN char *cabocha_tree_strdup(cabocha_tree_t *tree, const char *str);
CABOCHA_DLL_EXTERN char *cabocha_tree_alloc(cabocha_tree_t *tree, size_t size);

/* `format` is an int; presumably one of the CABOCHA_FORMAT_* constants
   -- confirm against the implementation. */
CABOCHA_DLL_EXTERN const char *cabocha_tree_tostr(cabocha_tree_t *tree, int format);
CABOCHA_DLL_EXTERN const char *cabocha_tree_tostr2(cabocha_tree_t *tree, int format,
                                                   char *str, size_t length);

CABOCHA_DLL_EXTERN void cabocha_tree_set_charset(cabocha_tree_t *tree,
                                                 int charset);
CABOCHA_DLL_EXTERN int cabocha_tree_charset(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN void cabocha_tree_set_posset(cabocha_tree_t *tree,
                                                int posset);
CABOCHA_DLL_EXTERN int cabocha_tree_posset(cabocha_tree_t *tree);
CABOCHA_DLL_EXTERN void cabocha_tree_set_output_layer(cabocha_tree_t *tree,
                                                      int output_layer);
CABOCHA_DLL_EXTERN int cabocha_tree_output_layer(cabocha_tree_t *tree);

/* Further argc/argv entry points. */
CABOCHA_DLL_EXTERN int cabocha_learn(int argc, char **argv);
CABOCHA_DLL_EXTERN int cabocha_system_eval(int argc, char **argv);
CABOCHA_DLL_EXTERN int cabocha_model_index(int argc, char **argv);
#endif

#ifdef __cplusplus
}
#endif

/* for C++ */
#ifdef __cplusplus

namespace CaboCha {

class Tree;
typedef struct cabocha_chunk_t Chunk;
typedef struct cabocha_token_t Token;

// The enumerations below give typed names to the C constants above.

enum CharsetType {
  EUC_JP = CABOCHA_EUC_JP,
  CP932  = CABOCHA_CP932,
  UTF8   = CABOCHA_UTF8,
  ASCII  = CABOCHA_ASCII
};

enum PossetType {
  IPA    = CABOCHA_IPA,
  JUMAN  = CABOCHA_JUMAN,
  UNIDIC = CABOCHA_UNIDIC
};

enum FormatType {
  FORMAT_TREE         = CABOCHA_FORMAT_TREE,
  FORMAT_LATTICE      = CABOCHA_FORMAT_LATTICE,
  FORMAT_TREE_LATTICE = CABOCHA_FORMAT_TREE_LATTICE,
  FORMAT_XML          = CABOCHA_FORMAT_XML,
  FORMAT_NONE         = CABOCHA_FORMAT_NONE
};

enum InputLayerType {
  INPUT_RAW_SENTENCE = CABOCHA_INPUT_RAW_SENTENCE,
  INPUT_POS          = CABOCHA_INPUT_POS,
  INPUT_CHUNK        = CABOCHA_INPUT_CHUNK,
  INPUT_SELECTION    = CABOCHA_INPUT_SELECTION,
  INPUT_DEP          = CABOCHA_INPUT_DEP
};

enum OutputLayerType {
  OUTPUT_RAW_SENTENCE = CABOCHA_OUTPUT_RAW_SENTENCE,
  OUTPUT_POS          = CABOCHA_OUTPUT_POS,
  OUTPUT_CHUNK        = CABOCHA_OUTPUT_CHUNK,
  OUTPUT_SELECTION    = CABOCHA_OUTPUT_SELECTION,
  OUTPUT_DEP          = CABOCHA_OUTPUT_DEP
};

enum ParserType {
  TRAIN_NE    = CABOCHA_TRAIN_NE,
  TRAIN_CHUNK = CABOCHA_TRAIN_CHUNK,
  TRAIN_DEP   = CABOCHA_TRAIN_DEP
};

// No trailing comma after the last enumerator: C++03 rejects it.
enum ParsingAlgorithm {
  SHIFT_REDUCE = CABOCHA_SHIFT_REDUCE,
  TOURNAMENT   = CABOCHA_TOURNAMENT
};

class TreeAllocator;

// C++ counterpart of the cabocha_tree_* functions: holds a sentence plus
// its tokens and chunks, and the charset / posset / output-layer settings.
class Tree {
 public:
  void set_sentence(const char *sentence);
  const char *sentence() const;
  size_t sentence_size() const;

#ifndef SWIG
  void set_sentence(const char *sentence, size_t length);
#endif

  const Chunk *chunk(size_t i) const;
  const Token *token(size_t i) const;

#ifndef SWIG
  Chunk *mutable_chunk(size_t i);
  Token *mutable_token(size_t i);

  Token *add_token();
  Chunk *add_chunk();

  char *strdup(const char *str);
  char *alloc(size_t size);
  char **alloc_char_array(size_t size);

  TreeAllocator *allocator() const;
#endif

  bool read(const char *input,
            InputLayerType input_layer);

#ifndef SWIG
  bool read(const char *input, size_t length,
            InputLayerType input_layer);
  bool read(const mecab_node_t *node);
#endif

  bool empty() const;
  void clear();
  void clear_chunk();

  size_t chunk_size() const;
  size_t token_size() const;
  size_t size() const;

  const char *toString(FormatType output_format);

#ifndef SWIG
  const char *toString(FormatType output_format,
                       char *output, size_t length) const;
#endif

  // Plain accessors for the three settings stored in this object.
  CharsetType charset() const { return charset_; }
  void set_charset(CharsetType charset) { charset_ = charset; }
  PossetType posset() const { return posset_; }
  void set_posset(PossetType posset) { posset_ = posset; }
  OutputLayerType output_layer() const { return output_layer_; }
  void set_output_layer(OutputLayerType output_layer) { output_layer_ = output_layer; }

  const char *what();

  explicit Tree();
  virtual ~Tree();

 private:
  TreeAllocator   *tree_allocator_;
  CharsetType      charset_;
  PossetType       posset_;
  OutputLayerType  output_layer_;
};

// Abstract parser interface; instances are obtained through the static
// create() functions or the free createParser() functions below.
class Parser {
 public:
  virtual const Tree *parse(const char *input) = 0;
  virtual const char *parseToString(const char *input) = 0;
  virtual const Tree *parse(Tree *tree) const = 0;

#ifndef SWIG
  virtual const Tree *parse(const char *input, size_t length) = 0;
  virtual const char *parseToString(const char *input, size_t length) = 0;
  virtual const char *parseToString(const char *input, size_t length,
                                    char *output, size_t output_length) = 0;
#endif

  virtual const char *what() = 0;
  static const char *version();

  virtual ~Parser() {}

#ifndef SWIG
  static Parser *create(int argc, char **argv);
  static Parser *create(const char *arg);
#endif
};

CABOCHA_DLL_EXTERN Parser *createParser(int argc, char **argv);
CABOCHA_DLL_EXTERN Parser *createParser(const char *arg);
CABOCHA_DLL_EXTERN const char *getParserError();
CABOCHA_DLL_EXTERN const char *getLastError();
}
#endif
#endif