1
22
23 package com.liferay.portal.lucene;
24
import com.liferay.portal.kernel.util.CharPool;
import com.liferay.portal.kernel.util.GetterUtil;
import com.liferay.portal.kernel.util.StringMaker;
import com.liferay.portal.kernel.util.StringPool;
import com.liferay.portal.kernel.util.Validator;
import com.liferay.portal.util.PropsValues;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.jackrabbit.extractor.HTMLTextExtractor;
import org.apache.jackrabbit.extractor.MsExcelTextExtractor;
import org.apache.jackrabbit.extractor.MsPowerPointTextExtractor;
import org.apache.jackrabbit.extractor.MsWordTextExtractor;
import org.apache.jackrabbit.extractor.OpenOfficeTextExtractor;
import org.apache.jackrabbit.extractor.PdfTextExtractor;
import org.apache.jackrabbit.extractor.PlainTextExtractor;
import org.apache.jackrabbit.extractor.RTFTextExtractor;
import org.apache.jackrabbit.extractor.TextExtractor;
import org.apache.jackrabbit.extractor.XMLTextExtractor;
import org.apache.lucene.document.Field;
53
54
60 public class LuceneFileExtractor {
61
62 public Field getFile(String field, InputStream is, String fileExt) {
63 String text = null;
64
65 try {
66 fileExt = GetterUtil.getString(fileExt).toLowerCase();
67
68 TextExtractor extractor = null;
69
70 String contentType = null;
71 String encoding = System.getProperty("encoding");
72
73 if (fileExt.equals(".doc")) {
74 extractor = new MsWordTextExtractor();
75
76 contentType = "application/vnd.ms-word";
77 }
78 else if (fileExt.equals(".htm") || fileExt.equals(".html")) {
79 extractor = new HTMLTextExtractor();
80
81 contentType = "text/html";
82 }
83 else if (fileExt.equals(".odb") || fileExt.equals(".odf") ||
84 fileExt.equals(".odg") || fileExt.equals(".odp") ||
85 fileExt.equals(".ods") || fileExt.equals(".odt")) {
86
87 extractor = new OpenOfficeTextExtractor();
88
89 contentType = "application/vnd.oasis.opendocument.";
90
91 if (fileExt.equals(".odb")) {
92 contentType += "database";
93 }
94 else if (fileExt.equals(".odf")) {
95 contentType += "formula";
96 }
97 else if (fileExt.equals(".odg")) {
98 contentType += "graphics";
99 }
100 else if (fileExt.equals(".odp")) {
101 contentType += "presentation";
102 }
103 else if (fileExt.equals(".ods")) {
104 contentType += "spreadsheet";
105 }
106 else if (fileExt.equals(".odt")) {
107 contentType += "text";
108 }
109 }
110 else if (fileExt.equals(".pdf")) {
111 extractor = new PdfTextExtractor();
112
113 contentType = "application/pdf";
114 }
115 else if (fileExt.equals(".ppt")) {
116 extractor = new MsPowerPointTextExtractor();
117
118 contentType = "application/vnd.ms-powerpoint";
119 }
120 else if (fileExt.equals(".rtf")) {
121 extractor = new RTFTextExtractor();
122
123 contentType = "application/rtf";
124 }
125 else if (fileExt.equals(".txt")) {
126 extractor = new PlainTextExtractor();
127
128 contentType = "text/plain";
129 }
130 else if (fileExt.equals(".xls")) {
131 extractor = new MsExcelTextExtractor();
132
133 contentType = "application/vnd.ms-excel";
134 }
135 else if (fileExt.equals(".xml")) {
136 extractor = new XMLTextExtractor();
137
138 contentType = "text/xml";
139 }
140
141 if (extractor != null) {
142 if (_log.isInfoEnabled()) {
143 _log.info(
144 "Using extractor " + extractor.getClass().getName() +
145 " for extension " + fileExt);
146 }
147
148 StringMaker sm = new StringMaker();
149
150 BufferedReader reader = new BufferedReader(
151 extractor.extractText(is, contentType, encoding));
152
153 int i;
154
155 while ((i = reader.read()) != -1) {
156 sm.append((char)i);
157 }
158
159 reader.close();
160
161 text = sm.toString();
162
163 if (Validator.isNotNull(
164 PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
165
166 text = regexpStrip(text);
167 }
168 }
169 else {
170 if (_log.isInfoEnabled()) {
171 _log.info("No extractor found for extension " + fileExt);
172 }
173 }
174 }
175 catch (Exception e) {
176 _log.error(e);
177 }
178
179 if (_log.isDebugEnabled()) {
180 _log.debug("Extractor returned text:\n\n" + text);
181 }
182
183 if (text == null) {
184 text = StringPool.BLANK;
185 }
186
187 return LuceneFields.getText(field, text);
188 }
189
190 public Field getFile(String field, byte[] byteArray, String fileExt)
191 throws IOException {
192
193 InputStream in = new BufferedInputStream(
194 new ByteArrayInputStream(byteArray));
195
196 return getFile(field, in, fileExt);
197 }
198
199 public Field getFile(String field, File file, String fileExt)
200 throws IOException {
201
202 InputStream in = new FileInputStream(file);
203
204 return getFile(field, in, fileExt);
205 }
206
207 protected String regexpStrip(String text) {
208 char[] array = text.toCharArray();
209
210 for (int i = 0; i < array.length; i++) {
211 String s = String.valueOf(array[i]);
212
213 if (!s.matches(PropsValues.LUCENE_FILE_EXTRACTOR_REGEXP_STRIP)) {
214 array[i] = CharPool.SPACE;
215 }
216 }
217
218 return new String(array);
219 }
220
221 private static Log _log = LogFactory.getLog(LuceneFileExtractor.class);
222
223 }