View Javadoc

1   /*
2    * Created on 07-Feb-2005
3    */
4   package uk.ac.roe.antigen.utils;
5   
6   import java.io.IOException;
7   import java.io.Reader;
8   import java.io.StringReader;
9   import java.util.HashMap;
10  import java.util.Map;
11  
12  import javax.swing.text.MutableAttributeSet;
13  import javax.swing.text.html.HTML;
14  import javax.swing.text.html.HTMLEditorKit;
15  import javax.swing.text.html.parser.ParserDelegator;
16  
/**
 * Converts HTML into plain text decorated with a light-weight textual markup:
 * bold text is wrapped in <code>*</code>, emphasised text in <code>_</code>,
 * H1-H3 headings are underlined, horizontal rules become a line of
 * underscores, links are preceded by their target in square brackets and
 * unordered list items become indented <code>o</code> bullets.
 */
public class HtmlToTextParser {

	/** Swing HTML parser front end used by {@link #parse(Reader)}. */
	private final ParserDelegator parser = new ParserDelegator();

	/**
	 * Reads HTML from the given reader and returns its plain-text rendering.
	 * 
	 * @param input
	 *            reader supplying the HTML to convert
	 * @return the text collected while parsing the input
	 * @throws IOException
	 *             propagated from {@link ParserDelegator#parse}
	 */
	public String parse(Reader input) throws IOException {
		// A fresh callback (and therefore a fresh buffer and list depth) per
		// call: nothing leaks from one parse into the next, and calls never
		// write into a buffer owned by another call.
		TagRemovalParserCallback callback = new TagRemovalParserCallback();
		parser.parse(input, callback, false);
		return callback.getText();
	}

	/**
	 * Simple test
	 * 
	 * @param args
	 *            ignored
	 * @throws IOException
	 *             if parsing the sample document fails
	 */
	public static void main(String[] args) throws IOException {
		String htmlText = "<html><head></head><body>" + "<h1>Heading 1</h1>"
				+ "<h2>Heading 2</h2>" + "Some <b>bold</b> test and a new<br>"
				+ "line in <em>italics</em>" + "<p>A separate paragraph</p>"
				+ "separated by a <hr> line, "
				+ "a <a href='http://www.astrogrid.org'>link</a>, "
				+ "and a <h3>third heading</h3> to finish.";

		Reader input = new StringReader(htmlText);
		HtmlToTextParser parser = new HtmlToTextParser();
		String output = parser.parse(input);
		System.out.println(output);

	}

	/**
	 * Parser callback that drops the HTML tags and appends a plain-text
	 * equivalent of the document to its own buffer. One instance is used per
	 * parse.
	 */
	private static class TagRemovalParserCallback extends HTMLEditorKit.ParserCallback {

		/** Number of underscores used to draw a horizontal rule. */
		private static final int LINELENGTH = 40;

		private static final char BOLDCHAR = '*';

		private static final char ITALCHAR = '_';

		/** Indentation emitted once per list nesting level before a bullet. */
		private static final String LIST_INDENT = "   ";

		/**
		 * Maps each supported heading tag to the string repeated to underline
		 * it. Kept as a raw type to match the rest of the file, which uses no
		 * generics. Populated once and only read afterwards.
		 */
		private static final Map HEADING_UNDERLINES = new HashMap();

		static {
			HEADING_UNDERLINES.put(HTML.Tag.H1, "=");
			HEADING_UNDERLINES.put(HTML.Tag.H2, "-");
			HEADING_UNDERLINES.put(HTML.Tag.H3, ".");
		}

		/** Plain text collected so far. */
		private final StringBuffer text = new StringBuffer();

		/** Current nesting depth of unordered lists. */
		private int indentationLevel = 0;

		/**
		 * @return everything appended to the buffer so far
		 */
		String getText() {
			return text.toString();
		}

		/**
		 * Handles tags reported without a matching end tag: line and
		 * paragraph breaks become a newline, a horizontal rule becomes a line
		 * of underscores on its own line.
		 */
		public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet attrs,
				int pos) {
			if (tag == HTML.Tag.BR || tag == HTML.Tag.P) {
				text.append("\n");
			}
			if (tag == HTML.Tag.HR) {
				text.append("\n");
				appendRepeated("_", LINELENGTH);
				text.append("\n");
			}
		}

		/**
		 * Emits the opening decoration for the tags this converter knows
		 * about; any other tag is dropped silently.
		 */
		public void handleStartTag(HTML.Tag tag, MutableAttributeSet attrs,
				int pos) {
			if (tag == HTML.Tag.B) {
				text.append(BOLDCHAR);
			}
			if (tag == HTML.Tag.EM) {
				text.append(ITALCHAR);
			}
			if (tag == HTML.Tag.P) {
				text.append('\n');
			}
			if (HEADING_UNDERLINES.containsKey(tag)) {
				// Headings always start on a line of their own; the end-tag
				// handler relies on this to measure the heading line.
				text.append('\n');
			}
			if (tag == HTML.Tag.A) {
				// Anchors without an href (e.g. <a name="...">) have no
				// target to show; previously they were rendered as "[null]".
				Object link = attrs.getAttribute(HTML.Attribute.HREF);
				if (link != null) {
					text.append('[').append(link).append(']');
				}
			}
			if (tag == HTML.Tag.LI) {
				text.append("\n");
				appendRepeated(LIST_INDENT, indentationLevel);
				text.append("o ");
			}
			if (tag == HTML.Tag.UL) {
				indentationLevel++;
			}
		}

		/**
		 * Emits the closing decoration for the tags this converter knows
		 * about, including the underline below a heading.
		 */
		public void handleEndTag(HTML.Tag tag, int pos) {
			if (tag == HTML.Tag.B) {
				text.append(BOLDCHAR);
			}
			if (tag == HTML.Tag.EM) {
				text.append(ITALCHAR);
			}
			if (tag == HTML.Tag.P) {
				text.append('\n');
			}
			String underline = (String) HEADING_UNDERLINES.get(tag);
			if (underline != null) {
				// Size the underline from what was actually written on the
				// heading line, so markers added inside the heading (bold,
				// italics, link targets) are covered as well as raw text.
				int lineStart = text.lastIndexOf("\n") + 1;
				int width = text.length() - lineStart;
				text.append('\n');
				appendRepeated(underline, width);
				text.append('\n');
			}
			if (tag == HTML.Tag.UL) {
				indentationLevel--;
				text.append('\n');
			}
		}

		/**
		 * Copies document text to the output unchanged.
		 */
		public void handleText(char[] data, int pos) {
			text.append(data);
		}

		/**
		 * Appends <code>piece</code> to the buffer <code>times</code> times;
		 * does nothing when <code>times</code> is zero or negative.
		 */
		private void appendRepeated(String piece, int times) {
			for (int i = 0; i < times; ++i) {
				text.append(piece);
			}
		}

	}

}