1   /**
2    * Copyright (c) 2000-2008 Liferay, Inc. All rights reserved.
3    *
4    * Permission is hereby granted, free of charge, to any person obtaining a copy
5    * of this software and associated documentation files (the "Software"), to deal
6    * in the Software without restriction, including without limitation the rights
7    * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8    * copies of the Software, and to permit persons to whom the Software is
9    * furnished to do so, subject to the following conditions:
10   *
11   * The above copyright notice and this permission notice shall be included in
12   * all copies or substantial portions of the Software.
13   *
14   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17   * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19   * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20   * SOFTWARE.
21   */
22  
23  package com.liferay.portlet.wiki.importers.mediawiki;
24  
25  import com.liferay.documentlibrary.service.DLLocalServiceUtil;
26  import com.liferay.portal.NoSuchUserException;
27  import com.liferay.portal.PortalException;
28  import com.liferay.portal.SystemException;
29  import com.liferay.portal.kernel.util.ArrayUtil;
30  import com.liferay.portal.kernel.util.ObjectValuePair;
31  import com.liferay.portal.kernel.util.ProgressTracker;
32  import com.liferay.portal.kernel.util.ProgressTrackerThreadLocal;
33  import com.liferay.portal.kernel.util.StringPool;
34  import com.liferay.portal.kernel.util.StringUtil;
35  import com.liferay.portal.kernel.util.Validator;
36  import com.liferay.portal.kernel.xml.Document;
37  import com.liferay.portal.kernel.xml.DocumentException;
38  import com.liferay.portal.kernel.xml.Element;
39  import com.liferay.portal.kernel.xml.SAXReaderUtil;
40  import com.liferay.portal.kernel.zip.ZipReader;
41  import com.liferay.portal.model.User;
42  import com.liferay.portal.service.UserLocalServiceUtil;
43  import com.liferay.portal.util.PropsValues;
44  import com.liferay.portlet.tags.NoSuchEntryException;
45  import com.liferay.portlet.tags.model.TagsEntry;
46  import com.liferay.portlet.tags.service.TagsEntryLocalServiceUtil;
47  import com.liferay.portlet.tags.service.TagsPropertyLocalServiceUtil;
48  import com.liferay.portlet.tags.util.TagsUtil;
49  import com.liferay.portlet.wiki.ImportFilesException;
50  import com.liferay.portlet.wiki.NoSuchPageException;
51  import com.liferay.portlet.wiki.importers.WikiImporter;
52  import com.liferay.portlet.wiki.importers.WikiImporterKeys;
53  import com.liferay.portlet.wiki.model.WikiNode;
54  import com.liferay.portlet.wiki.model.WikiPage;
55  import com.liferay.portlet.wiki.model.impl.WikiPageImpl;
56  import com.liferay.portlet.wiki.service.WikiPageLocalServiceUtil;
57  import com.liferay.portlet.wiki.translators.MediaWikiToCreoleTranslator;
58  import com.liferay.util.MapUtil;
59  
60  import java.io.BufferedReader;
61  import java.io.File;
62  import java.io.FileReader;
63  import java.io.IOException;
64  
65  import java.util.ArrayList;
66  import java.util.Collections;
67  import java.util.HashMap;
68  import java.util.Iterator;
69  import java.util.List;
70  import java.util.Map;
71  import java.util.regex.Matcher;
72  import java.util.regex.Pattern;
73  
74  import org.apache.commons.logging.Log;
75  import org.apache.commons.logging.LogFactory;
76  
77  /**
78   * <a href="MediaWikiImporter.java.html"><b><i>View Source</i></b></a>
79   *
80   * @author Alvaro del Castillo
81   * @author Jorge Ferrer
82   *
83   */
84  public class MediaWikiImporter implements WikiImporter {
85  
    /**
     * Content passed to <code>addPage</code> when <code>processImages</code>
     * has to create the shared images page.
     */
    public static final String SHARED_IMAGES_CONTENT = "See attachments";

    /**
     * Title of the wiki page that <code>processImages</code> looks up (or
     * creates) and attaches the imported images to.
     */
    public static final String SHARED_IMAGES_TITLE = "SharedImages";
89  
90      public void importPages(
91              long userId, WikiNode node, File[] files,
92              Map<String, String[]> options)
93          throws PortalException {
94  
95          if ((files.length < 1) || (files[0] == null) || (!files[0].exists())) {
96              throw new PortalException("The pages file is mandatory");
97          }
98  
99          File pagesFile = files[0];
100         File usersFile = files[1];
101         File imagesFile = files[2];
102 
103         try {
104             Document doc = SAXReaderUtil.read(pagesFile);
105 
106             Map<String, String> usersMap = readUsersFile(usersFile);
107 
108             Element root = doc.getRootElement();
109 
110             List<String> specialNamespaces = readSpecialNamespaces(root);
111 
112             processSpecialPages(userId, node, root, specialNamespaces);
113             processRegularPages(
114                 userId, node, root, specialNamespaces, usersMap, imagesFile,
115                 options);
116             processImages(userId, node, imagesFile);
117 
118             moveFrontPage(userId, node, options);
119         }
120         catch (DocumentException de) {
121             throw new ImportFilesException("Invalid XML file provided");
122         }
123         catch (IOException de) {
124             throw new ImportFilesException("Error reading the files provided");
125         }
126         catch (PortalException e) {
127             throw e;
128         }
129         catch (Exception e) {
130             throw new PortalException(e);
131         }
132     }
133 
134     protected long getUserId(
135             long userId, WikiNode node, String author,
136             Map<String, String> usersMap)
137         throws PortalException, SystemException {
138 
139         User user = null;
140 
141         String emailAddress = usersMap.get(author);
142 
143         try {
144             if (Validator.isNull(emailAddress)) {
145                 user = UserLocalServiceUtil.getUserByScreenName(
146                     node.getCompanyId(), author.toLowerCase());
147             }
148             else {
149                 user = UserLocalServiceUtil.getUserByEmailAddress(
150                     node.getCompanyId(), emailAddress);
151             }
152         }
153         catch (NoSuchUserException nsue) {
154             user = UserLocalServiceUtil.getUserById(userId);
155         }
156 
157         return user.getUserId();
158     }
159 
160     protected void importPage(
161             long userId, String author, WikiNode node, String title,
162             String content, String summary, Map<String, String> usersMap)
163         throws PortalException {
164 
165         try {
166             long authorUserId = getUserId(userId, node, author, usersMap);
167             String parentTitle = readParentTitle(content);
168             String redirectTitle = readRedirectTitle(content);
169             String[] tagsEntries = readTagsEntries(userId, node, content);
170 
171             if (Validator.isNull(redirectTitle)) {
172                 content = _translator.translate(content);
173             }
174             else {
175                 content =
176                     StringPool.DOUBLE_OPEN_BRACKET + redirectTitle +
177                         StringPool.DOUBLE_CLOSE_BRACKET;
178             }
179 
180             WikiPage page = null;
181 
182             try {
183                 page = WikiPageLocalServiceUtil.getPage(
184                     node.getNodeId(), title);
185             }
186             catch (NoSuchPageException nspe) {
187                 page = WikiPageLocalServiceUtil.addPage(
188                     authorUserId, node.getNodeId(), title, WikiPageImpl.NEW,
189                     null, true, null, null);
190             }
191 
192             WikiPageLocalServiceUtil.updatePage(
193                 authorUserId, node.getNodeId(), title, page.getVersion(),
194                 content, summary, true, "creole", parentTitle,
195                 redirectTitle, tagsEntries, null, null);
196         }
197         catch (Exception e) {
198             throw new PortalException("Error importing page " + title, e);
199         }
200     }
201 
202     protected boolean isSpecialMediaWikiPage(
203         String title, List<String> specialNamespaces) {
204 
205         for (String namespace: specialNamespaces) {
206             if (title.startsWith(namespace + StringPool.COLON)) {
207                 return true;
208             }
209         }
210 
211         return false;
212     }
213 
214     protected boolean isValidImage(String[] paths, byte[] bytes) {
215         if (ArrayUtil.contains(_SPECIAL_MEDIA_WIKI_DIRS, paths[0])) {
216             return false;
217         }
218 
219         if ((paths.length > 1) &&
220             (ArrayUtil.contains(_SPECIAL_MEDIA_WIKI_DIRS, paths[1]))) {
221 
222             return false;
223         }
224 
225         String fileName = paths[paths.length - 1];
226 
227         try {
228             DLLocalServiceUtil.validate(fileName, bytes);
229         }
230         catch (PortalException pe) {
231             return false;
232         }
233 
234         return true;
235     }
236 
237     protected void moveFrontPage(
238         long userId, WikiNode node, Map<String, String[]> options) {
239 
240         String frontPageTitle = MapUtil.getString(
241             options, WikiImporterKeys.OPTIONS_FRONT_PAGE);
242 
243         if (Validator.isNotNull(frontPageTitle)) {
244             frontPageTitle = normalizeTitle(frontPageTitle);
245 
246             try {
247                 if (WikiPageLocalServiceUtil.getPagesCount(
248                         node.getNodeId(), frontPageTitle, true) > 0) {
249 
250                     WikiPageLocalServiceUtil.movePage(
251                         userId, node.getNodeId(), frontPageTitle,
252                         WikiPageImpl.FRONT_PAGE, false, null, null);
253 
254                 }
255             }
256             catch (Exception e) {
257                 if (_log.isWarnEnabled()) {
258                     StringBuilder sb = new StringBuilder();
259 
260                     sb.append("Could not move ");
261                     sb.append(WikiPageImpl.FRONT_PAGE);
262                     sb.append(" to the title provided: ");
263                     sb.append(frontPageTitle);
264 
265                     _log.warn(sb.toString(), e);
266                 }
267             }
268 
269         }
270 
271     }
272 
273     protected String normalize(String categoryName, int length) {
274         categoryName = TagsUtil.toWord(categoryName.trim());
275 
276         return StringUtil.shorten(categoryName, length);
277     }
278 
279     protected String normalizeDescription(String description) {
280         description = description.replaceAll(
281             _categoriesPattern.pattern(), StringPool.BLANK);
282 
283         return normalize(description, 300);
284     }
285 
286     protected String normalizeTitle(String title) {
287         title = title.replaceAll(
288             PropsValues.WIKI_PAGE_TITLES_REMOVE_REGEXP, StringPool.BLANK);
289 
290         return StringUtil.shorten(title, 75);
291     }
292 
293     private void processImages(long userId, WikiNode node, File imagesFile)
294         throws Exception {
295 
296         if ((imagesFile == null) || (!imagesFile.exists())) {
297             return;
298         }
299 
300         ProgressTracker progressTracker =
301             ProgressTrackerThreadLocal.getProgressTracker();
302 
303         int count = 0;
304 
305         ZipReader zipReader = new ZipReader(imagesFile);
306 
307         Map<String, byte[]> entries = zipReader.getEntries();
308 
309         int total = entries.size();
310 
311         if (total > 0) {
312             try {
313                 WikiPageLocalServiceUtil.getPage(
314                     node.getNodeId(), SHARED_IMAGES_TITLE);
315             }
316             catch (NoSuchPageException nspe) {
317                 WikiPageLocalServiceUtil.addPage(
318                     userId, node.getNodeId(), SHARED_IMAGES_TITLE,
319                     SHARED_IMAGES_CONTENT, null, true, null, null);
320             }
321         }
322 
323         List<ObjectValuePair<String, byte[]>> attachments =
324             new ArrayList<ObjectValuePair<String, byte[]>>();
325 
326         Iterator<Map.Entry<String, byte[]>> itr = entries.entrySet().iterator();
327 
328         int percentage = 50;
329 
330         for (int i = 0; itr.hasNext(); i++) {
331             Map.Entry<String, byte[]> entry = itr.next();
332 
333             String key = entry.getKey();
334             byte[] value = entry.getValue();
335 
336             if (key.endsWith(StringPool.SLASH)) {
337                 if (_log.isInfoEnabled()) {
338                     _log.info("Ignoring " + key);
339                 }
340 
341                 continue;
342             }
343 
344             String[] paths = StringUtil.split(key, StringPool.SLASH);
345 
346             if (!isValidImage(paths, value)) {
347                 if (_log.isInfoEnabled()) {
348                     _log.info("Ignoring " + key);
349                 }
350 
351                 continue;
352             }
353 
354             String fileName = paths[paths.length - 1].toLowerCase();
355 
356             attachments.add(
357                 new ObjectValuePair<String, byte[]>(fileName, value));
358 
359             count++;
360 
361             if ((i % 5) == 0) {
362                 WikiPageLocalServiceUtil.addPageAttachments(
363                     node.getNodeId(), SHARED_IMAGES_TITLE, attachments);
364 
365                 attachments.clear();
366 
367                 percentage = Math.min(50 + (i * 50) / total, 99);
368 
369                 progressTracker.updateProgress(percentage);
370             }
371         }
372 
373         if (!attachments.isEmpty()) {
374             WikiPageLocalServiceUtil.addPageAttachments(
375                 node.getNodeId(), SHARED_IMAGES_TITLE, attachments);
376         }
377 
378         if (_log.isInfoEnabled()) {
379             _log.info("Imported " + count + " images into " + node.getName());
380         }
381     }
382 
383     protected void processRegularPages(
384         long userId, WikiNode node, Element root,
385         List<String> specialNamespaces, Map<String, String> usersMap,
386         File imagesFile, Map<String, String[]> options) {
387 
388         boolean importLatestVersion = MapUtil.getBoolean(
389             options, WikiImporterKeys.OPTIONS_IMPORT_LATEST_VERSION);
390 
391         ProgressTracker progressTracker =
392             ProgressTrackerThreadLocal.getProgressTracker();
393 
394         int count = 0;
395 
396         List<Element> pages = root.elements("page");
397 
398         int total = pages.size();
399 
400         Iterator<Element> itr = root.elements("page").iterator();
401 
402         int percentage = 10;
403         int maxPercentage = 50;
404 
405         if ((imagesFile == null) || (!imagesFile.exists())) {
406             maxPercentage = 99;
407         }
408 
409         int percentageRange = maxPercentage - percentage;
410 
411         for (int i = 0; itr.hasNext(); i++) {
412             Element pageEl = itr.next();
413 
414             String title = pageEl.elementText("title");
415 
416             title = normalizeTitle(title);
417 
418             percentage = Math.min(
419                 10 + (i * percentageRange) / total, maxPercentage);
420 
421             progressTracker.updateProgress(percentage);
422 
423             if (isSpecialMediaWikiPage(title, specialNamespaces)) {
424                 continue;
425             }
426 
427             List<Element> revisionEls = pageEl.elements("revision");
428 
429             if (importLatestVersion) {
430                 Element lastRevisionEl = revisionEls.get(
431                     revisionEls.size() - 1);
432 
433                 revisionEls = new ArrayList<Element>();
434 
435                 revisionEls.add(lastRevisionEl);
436             }
437 
438             for (Element curRevisionEl : revisionEls) {
439                 String author = curRevisionEl.element(
440                     "contributor").elementText("username");
441                 String content = curRevisionEl.elementText("text");
442                 String summary = curRevisionEl.elementText("comment");
443 
444                 try {
445                     importPage(
446                         userId, author, node, title, content, summary,
447                         usersMap);
448                 }
449                 catch (Exception e) {
450                     if (_log.isWarnEnabled()) {
451                         StringBuilder sb = new StringBuilder();
452 
453                         sb.append("Page with title ");
454                         sb.append(title);
455                         sb.append(" could not be imported");
456 
457                         _log.warn(sb.toString(), e);
458                     }
459                 }
460             }
461 
462             count++;
463         }
464 
465         if (_log.isInfoEnabled()) {
466             _log.info("Imported " + count + " pages into " + node.getName());
467         }
468     }
469 
470     protected void processSpecialPages(
471             long userId, WikiNode node, Element root,
472             List<String> specialNamespaces)
473         throws PortalException {
474 
475         ProgressTracker progressTracker =
476             ProgressTrackerThreadLocal.getProgressTracker();
477 
478         List<Element> pages = root.elements("page");
479 
480         int total = pages.size();
481 
482         Iterator<Element> itr = pages.iterator();
483 
484         for (int i = 0; itr.hasNext(); i++) {
485             Element page = itr.next();
486 
487             String title = page.elementText("title");
488 
489             if (!title.startsWith("Category:")) {
490                 if (isSpecialMediaWikiPage(title, specialNamespaces)) {
491                     root.remove(page);
492                 }
493 
494                 continue;
495             }
496 
497             String categoryName = title.substring("Category:".length());
498 
499             categoryName = normalize(categoryName, 75);
500 
501             String description = page.element("revision").elementText("text");
502 
503             description = normalizeDescription(description);
504 
505             try {
506                 TagsEntry tagsEntry = null;
507 
508                 try {
509                     tagsEntry = TagsEntryLocalServiceUtil.getEntry(
510                         node.getCompanyId(), categoryName);
511                 }
512                 catch (NoSuchEntryException nsee) {
513                     tagsEntry = TagsEntryLocalServiceUtil.addEntry(
514                         userId, categoryName);
515                 }
516 
517                 if (Validator.isNotNull(description)) {
518                     TagsPropertyLocalServiceUtil.addProperty(
519                         userId, tagsEntry.getEntryId(), "description",
520                         description);
521                 }
522             }
523             catch (SystemException se) {
524                  _log.error(se, se);
525             }
526 
527             if ((i % 5) == 0) {
528                 progressTracker.updateProgress((i * 10) / total);
529             }
530         }
531     }
532 
533     protected String readParentTitle(String content) {
534         Matcher matcher = _parentPattern.matcher(content);
535 
536         String redirectTitle = StringPool.BLANK;
537 
538         if (matcher.find()) {
539             redirectTitle = matcher.group(1);
540 
541             redirectTitle = normalizeTitle(redirectTitle);
542 
543             redirectTitle += " (disambiguation)";
544         }
545 
546         return redirectTitle;
547     }
548 
549     protected String readRedirectTitle(String content) {
550         Matcher matcher = _redirectPattern.matcher(content);
551 
552         String redirectTitle = StringPool.BLANK;
553 
554         if (matcher.find()) {
555             redirectTitle = matcher.group(1);
556 
557             redirectTitle = normalizeTitle(redirectTitle);
558         }
559 
560         return redirectTitle;
561     }
562 
563     protected List<String> readSpecialNamespaces(Element root)
564         throws ImportFilesException {
565 
566         List<String> namespaces = new ArrayList<String>();
567 
568         Element siteinfoEl = root.element("siteinfo");
569 
570         if (siteinfoEl == null) {
571             throw new ImportFilesException("Invalid pages XML file");
572         }
573 
574         Iterator<Element> itr = siteinfoEl.element(
575             "namespaces").elements("namespace").iterator();
576 
577         while (itr.hasNext()) {
578             Element namespace = itr.next();
579 
580             if (!namespace.attribute("key").equals("0")) {
581                 namespaces.add(namespace.getText());
582             }
583         }
584 
585         return namespaces;
586     }
587 
588     protected String[] readTagsEntries(
589             long userId, WikiNode node, String content)
590         throws PortalException, SystemException {
591 
592         Matcher matcher = _categoriesPattern.matcher(content);
593 
594         List<String> tagsEntries = new ArrayList<String>();
595 
596         while (matcher.find()) {
597             String categoryName = matcher.group(1);
598 
599             categoryName = normalize(categoryName, 75);
600 
601             TagsEntry tagsEntry = null;
602 
603             try {
604                 tagsEntry = TagsEntryLocalServiceUtil.getEntry(
605                     node.getCompanyId(), categoryName);
606             }
607             catch (NoSuchEntryException nsee) {
608                 tagsEntry = TagsEntryLocalServiceUtil.addEntry(
609                     userId, categoryName);
610             }
611 
612             tagsEntries.add(tagsEntry.getName());
613         }
614 
615         if (content.indexOf(_WORK_IN_PROGRESS) != -1) {
616             tagsEntries.add(_WORK_IN_PROGRESS_TAG);
617         }
618 
619         return tagsEntries.toArray(new String[tagsEntries.size()]);
620     }
621 
622     protected Map<String, String> readUsersFile(File usersFile)
623         throws IOException {
624 
625         if ((usersFile == null) || (!usersFile.exists())) {
626             return Collections.EMPTY_MAP;
627         }
628 
629         Map<String, String> usersMap = new HashMap<String, String>();
630 
631         BufferedReader reader = new BufferedReader(new FileReader(usersFile));
632 
633         String line = reader.readLine();
634 
635         while (line != null) {
636             String[] array = StringUtil.split(line);
637 
638             if ((array.length == 2) && (Validator.isNotNull(array[0])) &&
639                 (Validator.isNotNull(array[1]))) {
640 
641                 usersMap.put(array[0], array[1]);
642             }
643             else {
644                 if (_log.isInfoEnabled()) {
645                     _log.info(
646                         "Ignoring line " + line +
647                             " because it does not contain exactly 2 columns");
648                 }
649             }
650 
651             line = reader.readLine();
652         }
653 
654         return usersMap;
655     }
656 
657     private static final String[] _SPECIAL_MEDIA_WIKI_DIRS = new String[]{
658         "thumb", "temp", "archive"};
659 
660     private static final String _WORK_IN_PROGRESS = "{{Work in progress}}";
661 
662     private static final String _WORK_IN_PROGRESS_TAG = "work in progress";
663 
664     private static Log _log = LogFactory.getLog(MediaWikiImporter.class);
665 
666     private static Pattern _categoriesPattern = Pattern.compile(
667         "\\[\\[[Cc]ategory:([^\\]]*)\\]\\][\\n]*");
668 
669     private static Pattern _parentPattern = Pattern.compile(
670         "\\{{2}OtherTopics\\|([^\\}]*)\\}{2}");
671 
672     private static Pattern _redirectPattern = Pattern.compile(
673         "#REDIRECT \\[\\[([^\\]]*)\\]\\]");
674 
675     private MediaWikiToCreoleTranslator _translator =
676         new MediaWikiToCreoleTranslator();
677 
678 }