001 package org.maltparser.core.syntaxgraph.reader;
002
003 import java.io.BufferedReader;
004 import java.io.FileInputStream;
005 import java.io.FileNotFoundException;
006 import java.io.IOException;
007 import java.io.InputStream;
008 import java.io.InputStreamReader;
009 import java.io.UnsupportedEncodingException;
010 import java.net.URL;
011 import java.util.Iterator;
012 import java.util.SortedMap;
013
014 import org.maltparser.core.exception.MaltChainedException;
015 import org.maltparser.core.io.dataformat.ColumnDescription;
016 import org.maltparser.core.io.dataformat.DataFormatException;
017 import org.maltparser.core.io.dataformat.DataFormatInstance;
018 import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph;
019 import org.maltparser.core.syntaxgraph.PhraseStructure;
020 import org.maltparser.core.syntaxgraph.TokenStructure;
021 import org.maltparser.core.syntaxgraph.edge.Edge;
022 import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
023 import org.maltparser.core.syntaxgraph.node.TokenNode;
024 /**
025 *
026 *
027 * @author Johan Hall
028 */
029 public class BracketReader implements SyntaxGraphReader {
030 private BufferedReader reader;
031 private DataFormatInstance dataFormatInstance;
032 private int sentenceCount;
033 private StringBuilder input;
034 private int terminalCounter;
035 private int nonTerminalCounter;
036 private String optionString;
037 private SortedMap<String,ColumnDescription> inputColumns;
038 private SortedMap<String,ColumnDescription> edgeLabelColumns;
039 private SortedMap<String,ColumnDescription> phraseLabelColumns;
040
041 private String fileName = null;
042 private URL url = null;
043 private String charsetName;
044 private int nIterations;
045 private int cIterations;
046 private boolean closeStream = true;
047
048 private char STARTING_BRACKET = '(';
049 private char CLOSING_BRACKET = ')';
050 private char INPUT_SEPARATOR = ' ';
051 private char EDGELABEL_SEPARATOR = '-';
052 private char SENTENCE_SEPARATOR = '\n';
053 private char BLANK = ' ';
054 private char CARRIAGE_RETURN = '\r';
055 private char TAB = '\t';
056
057 public BracketReader() {
058 input = new StringBuilder();
059 nIterations = 1;
060 cIterations = 1;
061 }
062
063 private void reopen() throws MaltChainedException {
064 close();
065 if (fileName != null) {
066 open(fileName, charsetName);
067 } else if (url != null) {
068 open(url, charsetName);
069 } else {
070 throw new DataFormatException("The input stream cannot be reopen. ");
071 }
072 }
073
074 public void open(String fileName, String charsetName) throws MaltChainedException {
075 setFileName(fileName);
076 setCharsetName(charsetName);
077 try {
078 open(new FileInputStream(fileName), charsetName);
079 }catch (FileNotFoundException e) {
080 throw new DataFormatException("The input file '"+fileName+"' cannot be found. ", e);
081 }
082 }
083 public void open(URL url, String charsetName) throws MaltChainedException {
084 setUrl(url);
085 setCharsetName(charsetName);
086 try {
087 open(url.openStream(), charsetName);
088 } catch (IOException e) {
089 throw new DataFormatException("The URL '"+url.toString()+"' cannot be opened. ", e);
090 }
091 }
092
093 public void open(InputStream is, String charsetName) throws MaltChainedException {
094 try {
095 if (is == System.in) {
096 closeStream = false;
097 }
098 open(new InputStreamReader(is, charsetName));
099 } catch (UnsupportedEncodingException e) {
100 throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported. ", e);
101 }
102 }
103
104 private void open(InputStreamReader isr) throws MaltChainedException {
105 setReader(new BufferedReader(isr));
106 setSentenceCount(0);
107 }
108
109 public void readProlog() throws MaltChainedException {
110
111 }
112
113 public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException {
114 if (syntaxGraph == null || dataFormatInstance == null) {
115 return false;
116 }
117 syntaxGraph.clear();
118 int brackets = 0;
119 try {
120 int l = reader.read();
121 char c;
122 input.setLength(0);
123
124 while (true) {
125 if (l == -1) {
126 input.setLength(0);
127 return false;
128 }
129
130 c = (char)l;
131 l = reader.read();
132
133 if (c == SENTENCE_SEPARATOR || c == CARRIAGE_RETURN || c == TAB || c == -1) {
134
135 } else if (c == STARTING_BRACKET) {
136 input.append(c);
137 brackets++;
138 } else if (c == CLOSING_BRACKET) {
139 input.append(c);
140 brackets--;
141 } else if (c == INPUT_SEPARATOR) {
142 if (l != STARTING_BRACKET && l != CLOSING_BRACKET && l != INPUT_SEPARATOR && l != SENTENCE_SEPARATOR && l != CARRIAGE_RETURN && l != TAB && l != -1) {
143 input.append(c);
144 }
145 // Start BracketProgLangReader
146 } else if (c == '\\') {
147 c = (char) l;
148 l = reader.read();
149 if (c != ' ' && c != '(' && c != ')' && c != '\\' && c != 'n' && c != 'r' && c != 't' && c != '\"' && c != '\'') {
150 System.out.println("Error");
151 System.exit(1);
152 } else {
153 input.append("\\" + c);
154 }
155 // End BracketProgLangReader
156 } else if (brackets != 0){
157 input.append(c);
158 }
159 if (brackets == 0 && input.length() != 0) {
160 sentenceCount++;
161 terminalCounter = 1;
162 nonTerminalCounter = 1;
163 if (syntaxGraph instanceof PhraseStructure) {
164 bracketing((PhraseStructure)syntaxGraph, 0, input.length(), null);
165 if (syntaxGraph instanceof MappablePhraseStructureGraph) {
166 ((MappablePhraseStructureGraph)syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph)syntaxGraph), ((PhraseStructure)syntaxGraph).getPhraseStructureRoot());
167 }
168 }
169 return true;
170 }
171
172 if (c == -1) {
173 if (brackets != 0) {
174 close();
175 throw new MaltChainedException("Error when reading from the input file. ");
176 }
177 if (cIterations < nIterations) {
178 cIterations++;
179 reopen();
180 return true;
181 }
182 return false;
183 }
184 }
185 } catch (IOException e) {
186 close();
187 throw new MaltChainedException("Error when reading from the input file. ", e);
188 }
189
190 }
191
192 private void bracketing(PhraseStructure phraseStructure, int start, int end, PhraseStructureNode parent) throws MaltChainedException {
193 int bracketsdepth = 0;
194 int startpos = start-1;
195 for (int i = start, n = end; i < n; i++) {
196 if (input.charAt(i) == STARTING_BRACKET
197 // Start BracketProgLangReader
198 && (i == 0 || input.charAt(i - 1) != '\\')
199 // end BracketProgLangReader
200
201 ) {
202 if (bracketsdepth == 0) {
203 startpos = i;
204 }
205 bracketsdepth++;
206 } else if (input.charAt(i) == CLOSING_BRACKET
207 // Start BracketProgLangReader
208 && (i == 0 || input.charAt(i - 1) != '\\')
209 // end BracketProgLangReader
210 ) {
211 bracketsdepth--;
212 if (bracketsdepth == 0) {
213 extract(phraseStructure, startpos+1, i, parent);
214 }
215 }
216 }
217 }
218
219 private void extract(PhraseStructure phraseStructure, int begin, int end, PhraseStructureNode parent) throws MaltChainedException {
220 int index = -1;
221 for (int i = begin; i < end; i++) {
222 if (input.charAt(i) == STARTING_BRACKET
223 // Start BracketProgLangReader
224 && (i == begin || input.charAt(i - 1) != '\\')
225 // end BracketProgLangReader
226 ) {
227 index = i;
228 break;
229 }
230 }
231 if (index == -1) {
232 TokenNode t = phraseStructure.addTokenNode(terminalCounter);
233 if (t == null) {
234 close();
235 throw new MaltChainedException("Bracket Reader error: could not create a terminal node. ");
236 }
237
238 terminalCounter++;
239 Edge e = null;
240
241 if (parent != null) {
242 e = phraseStructure.addPhraseStructureEdge(parent, (PhraseStructureNode)t);
243 } else {
244 close();
245 throw new MaltChainedException("Bracket Reader error: could not find the parent node. ");
246 }
247
248 int start = begin;
249
250 Iterator<String> inputColumnsIterator = inputColumns.keySet().iterator();
251 Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator();
252 boolean noneNode = false;
253 boolean edgeLabels = false;
254 for (int i = begin; i < end; i++) {
255 if (input.charAt(i) == EDGELABEL_SEPARATOR || (input.charAt(i) == INPUT_SEPARATOR
256 // Start BracketProgLangReader
257 && (i == begin || input.charAt(i - 1) != '\\')
258 // end BracketProgLangReader
259 ) || i == end - 1) {
260 if (i == begin && input.charAt(i) == EDGELABEL_SEPARATOR) {
261 noneNode = true;
262 } else if (start == begin) {
263 if ((noneNode && input.charAt(i) != EDGELABEL_SEPARATOR) || !noneNode) {
264 if (inputColumnsIterator.hasNext()) {
265 t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(),
266
267 // Start BracketProgLangReader
268 decodeString(
269 // end BracketProgLangReader
270 (i == end - 1)?input.substring(start,end):input.substring(start, i)
271 // Start BracketProgLangReader
272 )
273 // end BracketProgLangReader
274 );
275 }
276 start = i + 1;
277 if (input.charAt(i) == EDGELABEL_SEPARATOR) {
278 edgeLabels = true;
279 }
280 }
281 } else if (edgeLabels && e != null) {
282 if (edgeLabelsColumnsIterator.hasNext()) {
283 e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), (i == end - 1)?input.substring(start,end):input.substring(start, i));
284 }
285 start = i + 1;
286 if (input.charAt(i) == INPUT_SEPARATOR
287 // Start BracketProgLangReader
288 && (i == begin || input.charAt(i - 1) != '\\')
289 // end BracketProgLangReader
290 ) {
291 edgeLabels = false;
292 }
293 } else if (input.charAt(i) == EDGELABEL_SEPARATOR && i != end - 1 && (input.charAt(i+1) != INPUT_SEPARATOR
294 // Start BracketProgLangReader
295 && (i == begin || input.charAt(i - 1) != '\\')
296 // end BracketProgLangReader
297 )
298 ) {
299 } else {
300 if (inputColumnsIterator.hasNext()) {
301 t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(), (i == end - 1)?input.substring(start,end):input.substring(start, i));
302 }
303 start = i + 1;
304 }
305 }
306 }
307 } else {
308 PhraseStructureNode nt;
309 Edge e = null;
310 if (parent == null) {
311 nt = phraseStructure.getPhraseStructureRoot();
312 } else {
313 nt = phraseStructure.addNonTerminalNode(nonTerminalCounter);
314 if (nt == null) {
315 close();
316 throw new MaltChainedException("Bracket Reader error: could not create a nonterminal node. ");
317 }
318 nonTerminalCounter++;
319
320 e = phraseStructure.addPhraseStructureEdge(parent, nt);
321 }
322 Iterator<String> phraseLabelColumnsIterator = phraseLabelColumns.keySet().iterator();
323 Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator();
324 int newbegin = begin;
325 int start = begin;
326
327 for (int i = begin; i < index; i++) {
328 if (input.charAt(i) == EDGELABEL_SEPARATOR || i == index - 1) {
329 if (start == newbegin) {
330 if (phraseLabelColumnsIterator.hasNext()) {
331 nt.addLabel(phraseLabelColumns.get(phraseLabelColumnsIterator.next()).getSymbolTable(), (i == index - 1)?input.substring(start,index):input.substring(start, i));
332 }
333 start = i + 1;
334 } else if (e != null) {
335 if (edgeLabelsColumnsIterator.hasNext()) {
336 e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), (i == index - 1)?input.substring(start,index):input.substring(start, i));
337 }
338 start = i + 1;
339 }
340 } else if (input.charAt(i) == BLANK) {
341 start++;
342 newbegin++;
343 }
344 }
345
346 bracketing(phraseStructure, index, end, nt);
347 }
348 }
349
350 private String decodeString(String string) {
351 return string.replace("\\(", "(").replace("\\)", ")").replace("\\ ", " ");
352 }
353
354 public void readEpilog() throws MaltChainedException {
355
356 }
357
358 public BufferedReader getReader() {
359 return reader;
360 }
361
362 public void setReader(BufferedReader reader) {
363 this.reader = reader;
364 }
365
366 public int getSentenceCount() throws MaltChainedException {
367 return sentenceCount;
368 }
369
370 public void setSentenceCount(int sentenceCount) {
371 this.sentenceCount = sentenceCount;
372 }
373
374 public DataFormatInstance getDataFormatInstance() {
375 return dataFormatInstance;
376 }
377
378 public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) {
379 this.dataFormatInstance = inputDataFormatInstance;
380 inputColumns = dataFormatInstance.getInputColumnDescriptions();
381 edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions();
382 phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions();
383 }
384
385 public String getOptions() {
386 return optionString;
387 }
388
389 public void setOptions(String optionString) throws MaltChainedException {
390 this.optionString = optionString;
391 }
392
393 public String getFileName() {
394 return fileName;
395 }
396
397 public void setFileName(String fileName) {
398 this.fileName = fileName;
399 }
400
401 public URL getUrl() {
402 return url;
403 }
404
405 public void setUrl(URL url) {
406 this.url = url;
407 }
408
409 public String getCharsetName() {
410 return charsetName;
411 }
412
413 public void setCharsetName(String charsetName) {
414 this.charsetName = charsetName;
415 }
416
417 public int getNIterations() {
418 return nIterations;
419 }
420
421 public void setNIterations(int iterations) {
422 nIterations = iterations;
423 }
424
425 public int getIterationCounter() {
426 return cIterations;
427 }
428
429 public void close() throws MaltChainedException {
430 try {
431 if (reader != null) {
432 if (closeStream) {
433 reader.close();
434 }
435 reader = null;
436 }
437 } catch (IOException e) {
438 throw new DataFormatException("Error when closing the input file.", e);
439 }
440 }
441 }