C0 code coverage information
Generated on Fri Jul 11 15:55:29 -0700 2008 with rcov 0.7.0
Code reported as executed by Ruby looks like this...
and this: this line is also marked as covered.
Lines considered as run by rcov, but not reported by Ruby, look like this,
and this: these lines were inferred by rcov (using simple heuristics).
Finally, here's a line marked as not executed.
1 # = HTMLTokenizer
2 #
3 # Author:: Ben Giddings (mailto:bg-rubyforge@infofiend.com)
4 # Copyright:: Copyright (c) 2004 Ben Giddings
5 # License:: Distributes under the same terms as Ruby
6 #
7 #
8 # This is a partial port of the functionality behind Perl's HTML::TokeParser.
9 # Given a page, it progressively returns tokens from that page.
10 #
11 # $Id: htmltokenizer.rb,v 1.7 2005/06/07 21:05:53 merc Exp $
12
13 #
14 # A class to tokenize HTML.
15 #
16 # Example:
17 #
18 # page = "<HTML>
19 # <HEAD>
20 # <TITLE>This is the title</TITLE>
21 # </HEAD>
22 # <!-- Here comes the <a href=\"missing.link\">blah</a>
23 # comment body
24 # -->
25 # <BODY>
26 # <H1>This is the header</H1>
27 # <P>
28 # This is the paragraph, it contains
29 # <a href=\"link.html\">links</a>,
30 # <img src=\"blah.gif\" optional alt='images
31 # are
32 # really cool'>. Ok, here is some more text and
33 # <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
34 # </P>
35 # </body>
36 # </HTML>
37 # "
38 # toke = HTMLTokenizer.new(page)
39 #
40 # assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
41 # assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
42 # assert("links" == toke.getTrimmedText)
43 # assert(toke.getTag("IMG", "A").attr_hash['optional'])
44 # assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
45 #
class HTMLTokenizer
  # Version number reported by HTMLTokenizer.version.
  VERSION = 1.0

  # Get version of HTMLTokenizer lib
  def self.version
    VERSION
  end

  # The full text being tokenized.
  attr_reader :page

  # Create a new tokenizer, based on the content, used as a string.
  # +content+ is converted with +to_s+, so +nil+ yields an empty page.
  def initialize(content)
    @page = content.to_s
    @cur_pos = 0
  end

  # Reset the parser, setting the current position back at the start
  def reset
    @cur_pos = 0
  end

  # Look at the next token, but don't actually grab it.
  #
  # Returns an HTMLComment, HTMLTag or HTMLText built from the page text
  # at the current position, or +nil+ once the whole page is consumed.
  # Raises HTMLTokenizerError when a tag or comment is opened but never
  # closed.
  def peekNextToken
    return nil if @cur_pos >= @page.length

    if at_tag_start?
      if @page[@cur_pos + 1, 3] == '!--'
        # Token is a comment.  Start looking for the terminator after the
        # four-character "<!--" opener so the opener's own dashes can never
        # be mistaken for part of "-->".
        comment_end = @page.index('-->', @cur_pos + 4)
        if comment_end.nil?
          raise HTMLTokenizerError, "No end found to started comment:\n#{@page[@cur_pos,80]}"
        end
        HTMLComment.new(@page[@cur_pos..(comment_end + 2)])
      else
        # Token is a html tag
        tag_end = @page.index('>', @cur_pos + 1)
        if tag_end.nil?
          raise HTMLTokenizerError, "No end found to started tag:\n#{@page[@cur_pos,80]}"
        end
        HTMLTag.new(@page[@cur_pos..tag_end])
      end
    else
      # Next token is text: everything up to (not including) the next '<',
      # or the rest of the page when there is no further '<'.
      next_tag = @page.index('<', @cur_pos)
      text_end = next_tag.nil? ? -1 : next_tag - 1
      HTMLText.new(@page[@cur_pos..text_end])
    end
  end

  # Get the next token and advance past it.  Returns an instance of
  # * HTMLText
  # * HTMLComment
  # * HTMLTag
  # or +nil+ when the end of the page has been reached.
  def getNextToken
    token = peekNextToken
    @cur_pos += token.raw.length if token
    token
  end

  # Get a tag from the specified set of desired tags.
  # For example:
  # <tt>foo = toke.getTag("h1", "h2", "h3")</tt>
  # Will return the next header tag encountered.
  #
  # Names are matched case-insensitively.  With no arguments the next tag
  # of any kind is returned.  Every token before the match is consumed;
  # returns +nil+ if the page ends before a matching tag is found.
  def getTag(*sought_tags)
    wanted = sought_tags.collect { |name| name.downcase }

    while (token = getNextToken)
      next unless token.kind_of?(HTMLTag)
      break if wanted.empty? || wanted.include?(token.tag_name)
    end
    token
  end

  # Get all the text between the current position and the next tag
  # (if specified) or a specific later tag.
  #
  # Without +until_tag+: returns the text token at the current position
  # (consuming it), or "" when the next token is a tag or the page is
  # exhausted.
  #
  # With +until_tag+: consumes tokens up to, but not including, the first
  # tag with that name (matched case-insensitively, like getTag) and
  # returns the non-empty token texts, each followed by a single space.
  def getText(until_tag = nil)
    if until_tag.nil?
      # At the end of the page there is no token to take text from.
      return "" if @cur_pos >= @page.length || at_tag_start?
      return getNextToken.text
    end

    # Tag names are stored lower-cased, so normalize the requested name.
    stop_name = until_tag.to_s.downcase
    ret_str = ""

    while (token = peekNextToken)
      break if token.kind_of?(HTMLTag) && token.tag_name == stop_name

      piece = token.text
      ret_str << (piece + " ") if piece != ""
      getNextToken
    end

    ret_str
  end

  # Like getText, but squeeze all whitespace, getting rid of
  # leading and trailing whitespace, and squeezing multiple
  # spaces into a single space.
  def getTrimmedText(until_tag = nil)
    getText(until_tag).strip.gsub(/\s+/, " ")
  end

  private

  # True when the character at the current position opens a tag or comment.
  def at_tag_start?
    @page[@cur_pos, 1] == '<'
  end
end
168
# Error raised by the tokenizer and the token classes when the text they
# are given cannot be parsed (for example an unterminated tag or comment).
#
# Derives from StandardError so that a plain +rescue+ clause catches it.
class HTMLTokenizerError < StandardError
end
171
# Base class shared by the three kinds of HTML token (text, comment, tag).
class HTMLToken
  # The exact source text this token was built from.
  attr_accessor :raw

  # Wrap +text+, which is stored unchanged as the token's raw source.
  def initialize(text)
    @raw = text
  end

  # String form of a token: its raw source, unmodified.
  def to_s
    raw
  end

  # Textual content of the token.  A generic token has none, so this is
  # the empty string; subclasses override it where text is meaningful.
  def text
    ''
  end

  # The token's text with surrounding whitespace removed and every inner
  # run of whitespace collapsed to one space.
  def trimmed_text
    stripped = text.strip
    stripped.gsub(/\s+/m, " ")
  end

  # Two tokens (or a token and anything else) are equal when the raw
  # source matches the other object's string form.
  def ==(other)
    other_source = other.to_s
    raw == other_source
  end
end
200
# A run of plain page text found outside of any tag or comment.
class HTMLText < HTMLToken
  # For a text token the textual content is simply the raw source.
  def text
    raw
  end
end
207
# Class representing an HTML comment
class HTMLComment < HTMLToken
  # The comment body: everything between "<!--" and "-->" with leading
  # and trailing whitespace removed.
  attr_accessor :contents

  # Build a comment token from +text+, which must be one complete comment
  # running from "<!--" to "-->".
  #
  # Raises HTMLTokenizerError when +text+ is not such a comment.
  def initialize(text)
    super(text)
    # \A and \z anchor to the whole string.  Line anchors (^ and $) would
    # accept any multi-line text that merely contains a comment on one of
    # its lines.
    match = /\A<!--\s*(.*?)\s*-->\z/m.match(text)
    if match.nil?
      raise HTMLTokenizerError, "Text passed to HTMLComment.initialize is not a comment"
    end

    @contents = match[1]
  end
end
221
# Class representing an HTML tag
class HTMLTag < HTMLToken
  # +end_tag+ is true for closing tags such as "</p>".
  # +tag_name+ is the lower-cased tag name, prefixed with "/" for end tags.
  attr_reader :end_tag, :tag_name

  # Build a tag token from +text+, which must start with "<" and end
  # with ">".
  #
  # Raises HTMLTokenizerError when +text+ is not delimited that way or
  # contains no tag name.
  def initialize(text)
    super(text)
    if text[0, 1] != '<' || text[-1, 1] != '>'
      raise HTMLTokenizerError, "Text passed to HTMLTag.initialize is not a tag"
    end

    @attr_hash = Hash.new

    # The first run of name characters in the text is taken as the tag name.
    name = text.scan(/[\w:-]+/)[0]
    if name.nil?
      raise HTMLTokenizerError, "Error, tag is nil: #{text}"
    end

    if text[1, 1] == '/'
      # It's an end tag
      @end_tag = true
      @tag_name = '/' + name.downcase
    else
      @end_tag = false
      @tag_name = name.downcase
    end

    # Attributes are parsed on demand by attr_hash.
    @hashed = false
  end

  # Retrieve a hash of all the tag's attributes.
  # Lazily done, so that if you don't look at a tag's attributes
  # things go quicker
  #
  # Keys are lower-cased attribute names.  Quoted values have their
  # enclosing quotes removed.  An attribute written without a value maps
  # to its own name as it appears in the source.  End tags always yield
  # an empty hash.
  def attr_hash
    unless @hashed
      parse_attributes unless @end_tag
      @hashed = true
    end

    @attr_hash
  end

  # Get the 'alt' text for a tag, if it exists, or an empty string otherwise.
  # Only 'img' and 'applet' start tags are consulted.
  def text
    return '' if end_tag

    case tag_name
    when 'img', 'applet'
      alt = attr_hash['alt']
      return alt unless alt.nil?
    end
    ''
  end

  private

  # Fill @attr_hash from the attribute section of the raw tag text.
  def parse_attributes
    # Everything after the tag name and before the closing ">" (or "/>").
    section = @raw.scan(/<[\w:-]+\s+(.*?)\/?>/m)[0]
    return unless section.kind_of?(Array)

    pairs = section[0].scan(/\s*([\w:-]+)(?:\s*=\s*("[^"]*"|'[^']*'|([^"'>][^\s>]*)))?/m)
    pairs.each do |name, value, _unquoted|
      @attr_hash[name.downcase] =
        if value.nil?
          # No "=value" part: the attribute's name stands in for its value.
          name
        elsif value[0, 1] == '"' || value[0, 1] == "'"
          # Drop the enclosing quotes.
          value[1..-2]
        else
          value
        end
    end
  end
end
305
Generated using the rcov code coverage analysis tool for Ruby version 0.7.0.