1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 """The ``lxml.html`` tool set for HTML handling.
32 """
33
34 import sys
35 import re
36 try:
37 from urlparse import urljoin
38 except ImportError:
39
40 from urllib.parse import urljoin
41 import copy
42 from lxml import etree
43 from lxml.html import defs
44 from lxml.html._setmixin import SetMixin
45 try:
46 from collections import MutableMapping as DictMixin
47 except ImportError:
48
49 from UserDict import DictMixin
50 try:
51 set
52 except NameError:
53
54 from sets import Set as set
55 try:
56 bytes
57 except NameError:
58
59 bytes = str
60 try:
61 unicode
62 except NameError:
63
64 unicode = str
65 try:
66 basestring
67 except NameError:
68
69 basestring = (str, bytes)
70
72 if not s:
73 return s
74 import sys
75 if sys.version_info[0] >= 3:
76 sub = re.compile(r"^(\s*)u'", re.M).sub
77 else:
78 sub = re.compile(r"^(\s*)b'", re.M).sub
79 return sub(r"\1'", s)
80
# Public names exported by ``from lxml.html import *``.  Each name is listed
# exactly once ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
86
87 XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
88
89 _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
90 namespaces={'x':XHTML_NAMESPACE})
91 _options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
92 namespaces={'x':XHTML_NAMESPACE})
93 _forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
94 namespaces={'x':XHTML_NAMESPACE})
95
96 _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
97 _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
98 _collect_string_content = etree.XPath("string()")
99 _css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
100 _css_import_re = re.compile(r'@import "(.*?)"')
101 _label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
102 namespaces={'x':XHTML_NAMESPACE})
103 _archive_re = re.compile(r'[^ ]+')
104
106 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
107 return s[1:-1], pos+1
108 else:
109 return s,pos
110
120
126
128
130 """
131 Returns the base URL, given when the page was parsed.
132
133 Use with ``urlparse.urljoin(el.base_url, href)`` to get
134 absolute URLs.
135 """
136 return self.getroottree().docinfo.URL
137 base_url = property(base_url, doc=base_url.__doc__)
138
144 forms = property(forms, doc=forms.__doc__)
145
147 """
148 Return the <body> element. Can be called from a child element
149 to get the document's body.
150 """
151 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
152 body = property(body, doc=body.__doc__)
153
155 """
156 Returns the <head> element. Can be called from a child
157 element to get the document's head.
158 """
159 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
160 head = property(head, doc=head.__doc__)
161
163 """
164 Get or set any <label> element associated with this element.
165 """
166 id = self.get('id')
167 if not id:
168 return None
169 result = _label_xpath(self, id=id)
170 if not result:
171 return None
172 else:
173 return result[0]
175 id = self.get('id')
176 if not id:
177 raise TypeError(
178 "You cannot set a label for an element (%r) that has no id"
179 % self)
180 if _nons(label.tag) != 'label':
181 raise TypeError(
182 "You can only assign label to a label element (not %r)"
183 % label)
184 label.set('for', id)
189 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
190
192 """
193 Removes this element from the tree, including its children and
194 text. The tail text is joined to the previous element or
195 parent.
196 """
197 parent = self.getparent()
198 assert parent is not None
199 if self.tail:
200 previous = self.getprevious()
201 if previous is None:
202 parent.text = (parent.text or '') + self.tail
203 else:
204 previous.tail = (previous.tail or '') + self.tail
205 parent.remove(self)
206
208 """
209 Remove the tag, but not its children or text. The children and text
210 are merged into the parent.
211
212 Example::
213
214 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
215 >>> h.find('.//b').drop_tag()
216 >>> print(tostring(h, encoding='unicode'))
217 <div>Hello World!</div>
218 """
219 parent = self.getparent()
220 assert parent is not None
221 previous = self.getprevious()
222 if self.text and isinstance(self.tag, basestring):
223
224 if previous is None:
225 parent.text = (parent.text or '') + self.text
226 else:
227 previous.tail = (previous.tail or '') + self.text
228 if self.tail:
229 if len(self):
230 last = self[-1]
231 last.tail = (last.tail or '') + self.tail
232 elif previous is None:
233 parent.text = (parent.text or '') + self.tail
234 else:
235 previous.tail = (previous.tail or '') + self.tail
236 index = parent.index(self)
237 parent[index:index+1] = self[:]
238
240 """
241 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
242 """
243 rel = rel.lower()
244 return [el for el in _rel_links_xpath(self)
245 if el.get('rel').lower() == rel]
246
248 """
249 Find any elements with the given class name.
250 """
251 return _class_xpath(self, class_name=class_name)
252
254 """
255 Get the first element in a document with the given id. If none is
256 found, return the default argument if provided or raise KeyError
257 otherwise.
258
259 Note that there can be more than one element with the same id,
260 and this isn't uncommon in HTML documents found in the wild.
261 Browsers return only the first match, and this function does
262 the same.
263 """
264 try:
265
266
267 return _id_xpath(self, id=id)[0]
268 except IndexError:
269 if default:
270 return default[0]
271 else:
272 raise KeyError(id)
273
def text_content(self):
    """
    Return the text content of this element, including the text of
    all of its children.
    """
    # ``_collect_string_content`` is the pre-compiled XPath ``string()``.
    collected = _collect_string_content(self)
    return collected
279
def cssselect(self, expr, translator='html'):
    """
    Evaluate the CSS expression ``expr`` against this element and its
    children and return a list of the results.

    This is shorthand for
    ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``.
    The selector is compiled again on every call, so pre-compiling the
    expression yourself can provide a substantial speedup.
    """
    # Imported inside the method rather than at module level.
    from lxml.cssselect import CSSSelector
    selector = CSSSelector(expr, translator=translator)
    return selector(self)
292
293
294
295
296
def make_links_absolute(self, base_url=None, resolve_base_href=True,
                        handle_failures=None):
    """
    Make all links in the document absolute, given the
    ``base_url`` for the document (the full URL where the document
    came from), or if no ``base_url`` is given, then the ``.base_url``
    of the document.

    If ``resolve_base_href`` is true, then any ``<base href>``
    tags in the document are used *and* removed from the document.
    If it is false then any such tag is ignored.

    If ``handle_failures`` is None (default), a failure to process
    a URL will abort the processing.  If set to 'ignore', errors
    are ignored.  If set to 'discard', failing URLs will be removed.
    The same policy is applied while resolving ``<base href>``.

    Raises ``ValueError`` for any other ``handle_failures`` value (before
    the document is touched) and ``TypeError`` if no base URL is
    available.
    """
    # Validate up front so that an invalid argument cannot leave the
    # document half-modified (resolve_base_href() removes <base> tags).
    if handle_failures not in (None, 'ignore', 'discard'):
        raise ValueError(
            "unexpected value for handle_failures: %r" % handle_failures)

    if base_url is None:
        base_url = self.base_url
        if base_url is None:
            raise TypeError(
                "No base_url given, and the document has no base_url")

    if resolve_base_href:
        # Forward the failure policy so that 'ignore'/'discard' are also
        # honoured while the <base href> value is being applied.
        self.resolve_base_href(handle_failures=handle_failures)

    if handle_failures is None:
        def link_repl(href):
            return urljoin(base_url, href)
    elif handle_failures == 'ignore':
        def link_repl(href):
            try:
                return urljoin(base_url, href)
            except ValueError:
                # keep the link exactly as it was
                return href
    else:
        def link_repl(href):
            try:
                return urljoin(base_url, href)
            except ValueError:
                # None tells rewrite_links() to remove the link
                return None

    self.rewrite_links(link_repl)
341
343 """
344 Find any ``<base href>`` tag in the document, and apply its
345 values to all links found in the document. Also remove the
346 tag once it has been applied.
347
348 If ``handle_failures`` is None (default), a failure to process
349 a URL will abort the processing. If set to 'ignore', errors
350 are ignored. If set to 'discard', failing URLs will be removed.
351 """
352 base_href = None
353 basetags = self.xpath('//base[@href]|//x:base[@href]',
354 namespaces={'x': XHTML_NAMESPACE})
355 for b in basetags:
356 base_href = b.get('href')
357 b.drop_tree()
358 if not base_href:
359 return
360 self.make_links_absolute(base_href, resolve_base_href=False,
361 handle_failures=handle_failures)
362
364 """
365 Yield (element, attribute, link, pos), where attribute may be None
366 (indicating the link is in the text). ``pos`` is the position
367 where the link occurs; often 0, but sometimes something else in
368 the case of links in stylesheets or style tags.
369
370 Note: <base href> is *not* taken into account in any way. The
371 link you get is exactly the link in the document.
372
373 Note: multiple links inside of a single text string or
374 attribute value are returned in reversed order. This makes it
375 possible to replace or delete them from the text string value
376 based on their reported text positions. Otherwise, a
377 modification at one text position can change the positions of
378 links reported later on.
379 """
380 link_attrs = defs.link_attrs
381 for el in self.iter(etree.Element):
382 attribs = el.attrib
383 tag = _nons(el.tag)
384 if tag != 'object':
385 for attrib in link_attrs:
386 if attrib in attribs:
387 yield (el, attrib, attribs[attrib], 0)
388 elif tag == 'object':
389 codebase = None
390
391
392 if 'codebase' in attribs:
393 codebase = el.get('codebase')
394 yield (el, 'codebase', codebase, 0)
395 for attrib in 'classid', 'data':
396 if attrib in attribs:
397 value = el.get(attrib)
398 if codebase is not None:
399 value = urljoin(codebase, value)
400 yield (el, attrib, value, 0)
401 if 'archive' in attribs:
402 for match in _archive_re.finditer(el.get('archive')):
403 value = match.group(0)
404 if codebase is not None:
405 value = urljoin(codebase, value)
406 yield (el, 'archive', value, match.start())
407 if tag == 'param':
408 valuetype = el.get('valuetype') or ''
409 if valuetype.lower() == 'ref':
410
411
412
413
414
415
416 yield (el, 'value', el.get('value'), 0)
417 if tag == 'style' and el.text:
418 urls = [
419 _unquote_match(match.group(1), match.start(1))
420 for match in _css_url_re.finditer(el.text)
421 ] + [
422 (match.group(1), match.start(1))
423 for match in _css_import_re.finditer(el.text)
424 ]
425 if urls:
426
427 urls = [ (start, url) for (url, start) in urls ]
428 urls.sort()
429
430
431 urls.reverse()
432 for start, url in urls:
433 yield (el, None, url, start)
434 if 'style' in attribs:
435 urls = list(_css_url_re.finditer(attribs['style']))
436 if urls:
437
438 for match in urls[::-1]:
439 url, start = _unquote_match(match.group(1), match.start(1))
440 yield (el, 'style', url, start)
441
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite every link in the document.  ``link_repl_func(link)`` is
    called for each link (with surrounding whitespace stripped) and
    its return value replaces the old link.

    Links are not necessarily absolute (unless
    ``make_links_absolute()`` was called first) and may be internal
    (e.g., ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    When ``base_href`` is given, the links are made absolute against
    it before ``link_repl_func()`` sees them.

    When ``link_repl_func`` returns None, the attribute or tag text
    holding the link is removed completely.
    """
    if base_href is None:
        if resolve_base_href:
            self.resolve_base_href()
    else:
        self.make_links_absolute(
            base_href, resolve_base_href=resolve_base_href)

    for element, attr_name, old_link, offset in self.iterlinks():
        replacement = link_repl_func(old_link.strip())
        if replacement == old_link:
            # nothing to do for this link
            continue

        in_text = attr_name is None
        if replacement is None:
            # the link is to be dropped altogether
            if in_text:
                element.text = ''
            else:
                del element.attrib[attr_name]
            continue

        end = offset + len(old_link)
        if in_text:
            element.text = element.text[:offset] + replacement + element.text[end:]
            continue

        current = element.get(attr_name)
        if offset or len(current) != len(old_link):
            # the link is only a part of the attribute value: splice it in
            replacement = current[:offset] + replacement + current[end:]
        element.set(attr_name, replacement)
490
491
493 """
494 An object that represents a method on an element as a function;
495 the function takes either an element or an HTML string. It
496 returns whatever the function normally returns, or if the function
497 works in-place (and so returns None) it returns a serialized form
498 of the resulting document.
499 """
505 result_type = type(doc)
506 if isinstance(doc, basestring):
507 if 'copy' in kw:
508 raise TypeError(
509 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
510 doc = fromstring(doc, **kw)
511 else:
512 if 'copy' in kw:
513 make_a_copy = kw.pop('copy')
514 else:
515 make_a_copy = self.copy
516 if make_a_copy:
517 doc = copy.deepcopy(doc)
518 meth = getattr(doc, self.name)
519 result = meth(*args, **kw)
520
521 if result is None:
522
523 return _transform_result(result_type, doc)
524 else:
525 return result
526
527 find_rel_links = _MethodFunc('find_rel_links', copy=False)
528 find_class = _MethodFunc('find_class', copy=False)
529 make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
530 resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
531 iterlinks = _MethodFunc('iterlinks', copy=False)
532 rewrite_links = _MethodFunc('rewrite_links', copy=True)
533
536
539
542
545
546
548 """A lookup scheme for HTML Element classes.
549
550 To create a lookup instance with different Element classes, pass a tag
551 name mapping of Element classes in the ``classes`` keyword argument and/or
552 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
553 The special key '*' denotes a Mixin class that should be mixed into all
554 Element classes.
555 """
556 _default_element_classes = {}
557
558 - def __init__(self, classes=None, mixins=None):
575
576 - def lookup(self, node_type, document, namespace, name):
587
588
589
590
591
592 _looks_like_full_html_unicode = re.compile(
593 unicode(r'^\s*<(?:html|!doctype)'), re.I).match
594 _looks_like_full_html_bytes = re.compile(
595 r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
596
605
608 """
609 Parses several HTML elements, returning a list of elements.
610
611 The first item in the list may be a string (though leading
612 whitespace is removed). If no_leading_text is true, then it will
613 be an error if there is leading text, and it will always be a list
614 of only elements.
615
616 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
617 """
618 if parser is None:
619 parser = html_parser
620
621 if isinstance(html, bytes):
622 if not _looks_like_full_html_bytes(html):
623 html = '<html><body>%s</body></html>'.encode('ascii') % html
624 else:
625 if not _looks_like_full_html_unicode(html):
626 html = '<html><body>%s</body></html>' % html
627 doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
628 assert _nons(doc.tag) == 'html'
629 bodies = [e for e in doc if _nons(e.tag) == 'body']
630 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
631 body = bodies[0]
632 elements = []
633 if no_leading_text and body.text and body.text.strip():
634 raise etree.ParserError(
635 "There is leading text: %r" % body.text)
636 if body.text and body.text.strip():
637 elements.append(body.text)
638 elements.extend(body)
639
640
641 return elements
642
645 """
646 Parses a single HTML element; it is an error if there is more than
647 one element, or if anything but whitespace precedes or follows the
648 element.
649
650 If create_parent is true (or is a tag name) then a parent node
651 will be created to encapsulate the HTML in a single element. In
652 this case, leading or trailing text is allowed.
653
654 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
655 """
656 if parser is None:
657 parser = html_parser
658
659 accept_leading_text = bool(create_parent)
660
661 elements = fragments_fromstring(
662 html, parser=parser, no_leading_text=not accept_leading_text,
663 base_url=base_url, **kw)
664
665 if create_parent:
666 if not isinstance(create_parent, basestring):
667 create_parent = 'div'
668 new_root = Element(create_parent)
669 if elements:
670 if isinstance(elements[0], basestring):
671 new_root.text = elements[0]
672 del elements[0]
673 new_root.extend(elements)
674 return new_root
675
676 if not elements:
677 raise etree.ParserError('No elements found')
678 if len(elements) > 1:
679 raise etree.ParserError(
680 "Multiple elements found (%s)"
681 % ', '.join([_element_name(e) for e in elements]))
682 el = elements[0]
683 if el.tail and el.tail.strip():
684 raise etree.ParserError(
685 "Element followed by text: %r" % el.tail)
686 el.tail = None
687 return el
688
689 -def fromstring(html, base_url=None, parser=None, **kw):
755
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note that the result is a tree, not an element; call
    ``parse(...).getroot()`` to obtain the document root.

    The ``base_url`` keyword overrides the base URL, which is most
    useful when parsing from a file-like object.  When no ``parser`` is
    given, the module-level ``html_parser`` is used.
    """
    if parser is not None:
        return etree.parse(filename_or_url, parser, base_url=base_url, **kw)
    return etree.parse(filename_or_url, html_parser, base_url=base_url, **kw)
768
776
778 if isinstance(el, etree.CommentBase):
779 return 'comment'
780 elif isinstance(el, basestring):
781 return 'string'
782 else:
783 return _nons(el.tag)
784
785
786
787
788
893
894 HtmlElementClassLookup._default_element_classes['form'] = FormElement
895
932
934 if not url:
935 raise ValueError("cannot submit, no URL provided")
936
937 try:
938 from urllib import urlencode, urlopen
939 except ImportError:
940 from urllib.request import urlopen
941 from urllib.parse import urlencode
942 if method == 'GET':
943 if '?' in url:
944 url += '&'
945 else:
946 url += '?'
947 url += urlencode(values)
948 data = None
949 else:
950 data = urlencode(values)
951 return urlopen(url, data)
952
954
962 raise KeyError(
963 "You cannot remove keys from ElementDict")
967 return item in self.inputs
972
974 return '<%s for form %s>' % (
975 self.__class__.__name__,
976 self.inputs.form._name())
977
1043
1071
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and the
    value can be read and assigned through ``.value``.
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # XHTML-namespaced elements have their children serialised as XML,
        # everything else as HTML.
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            method = 'xml'
        else:
            method = 'html'
        pieces = [self.text or '']
        for child in self:
            pieces.append(
                etree.tostring(child, method=method, encoding='unicode'))
        return ''.join(pieces)

    def _value__set(self, value):
        # Drop all child elements, then store the new value as plain text.
        del self[:]
        self.text = value

    def _value__del(self):
        # Deleting the value empties the element: no text, no children.
        self.text = ''
        del self[:]

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1099
1100 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1101
1103 """
1104 ``<select>`` element. You can get the name with ``.name``.
1105
1106 ``.value`` will be the value of the selected option, unless this
1107 is a multi-select element (``<select multiple>``), in which case
1108 it will be a set-like object. In either case ``.value_options``
1109 gives the possible values.
1110
1111 The boolean attribute ``.multiple`` shows if this is a
1112 multi-select.
1113 """
1114
1116 """
1117 Get/set the value of this select (the selected option).
1118
1119 If this is a multi-select, this is a set-like object that
1120 represents all the selected options.
1121 """
1122 if self.multiple:
1123 return MultipleSelectOptions(self)
1124 for el in _options_xpath(self):
1125 if el.get('selected') is not None:
1126 value = el.get('value')
1127 if value is None:
1128 value = el.text or ''
1129 if value:
1130 value = value.strip()
1131 return value
1132 return None
1133
1135 if self.multiple:
1136 if isinstance(value, basestring):
1137 raise TypeError(
1138 "You must pass in a sequence")
1139 self.value.clear()
1140 self.value.update(value)
1141 return
1142 if value is not None:
1143 value = value.strip()
1144 for el in _options_xpath(self):
1145 opt_value = el.get('value')
1146 if opt_value is None:
1147 opt_value = el.text or ''
1148 if opt_value:
1149 opt_value = opt_value.strip()
1150 if opt_value == value:
1151 checked_option = el
1152 break
1153 else:
1154 raise ValueError(
1155 "There is no option with the value of %r" % value)
1156 for el in _options_xpath(self):
1157 if 'selected' in el.attrib:
1158 del el.attrib['selected']
1159 if value is not None:
1160 checked_option.set('selected', '')
1161
1168
1169 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1170
1185 value_options = property(value_options, doc=value_options.__doc__)
1186
1188 """
1189 Boolean attribute: is there a ``multiple`` attribute on this element.
1190 """
1191 return 'multiple' in self.attrib
1193 if value:
1194 self.set('multiple', '')
1195 elif 'multiple' in self.attrib:
1196 del self.attrib['multiple']
1197 multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)
1198
1199 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1200
1202 """
1203 Represents all the selected options in a ``<select multiple>`` element.
1204
1205 You can add to this set-like option to select an option, or remove
1206 to unselect the option.
1207 """
1208
1210 self.select = select
1211
1213 """
1214 Iterator of all the ``<option>`` elements.
1215 """
1216 return iter(_options_xpath(self.select))
1217 options = property(options)
1218
1220 for option in self.options:
1221 if 'selected' in option.attrib:
1222 opt_value = option.get('value')
1223 if opt_value is None:
1224 opt_value = option.text or ''
1225 if opt_value:
1226 opt_value = opt_value.strip()
1227 yield opt_value
1228
def add(self, item):
    """
    Mark the first option whose value equals ``item`` as selected.

    An option's value is its ``value`` attribute or, when that attribute
    is absent, its text; either one is stripped of surrounding
    whitespace before the comparison.  Raises ``ValueError`` if no
    option matches.
    """
    for candidate in self.options:
        candidate_value = candidate.get('value')
        if candidate_value is None:
            candidate_value = candidate.text or ''
        if candidate_value:
            candidate_value = candidate_value.strip()
        if candidate_value != item:
            continue
        candidate.set('selected', '')
        return
    raise ValueError(
        "There is no option with the value %r" % item)
1242
1244 for option in self.options:
1245 opt_value = option.get('value')
1246 if opt_value is None:
1247 opt_value = option.text or ''
1248 if opt_value:
1249 opt_value = opt_value.strip()
1250 if opt_value == item:
1251 if 'selected' in option.attrib:
1252 del option.attrib['selected']
1253 else:
1254 raise ValueError(
1255 "The option %r is not currently selected" % item)
1256 break
1257 else:
1258 raise ValueError(
1259 "There is not option with the value %r" % item)
1260
1262 return '<%s {%s} for select name=%r>' % (
1263 self.__class__.__name__,
1264 ', '.join([repr(v) for v in self]),
1265 self.select.name)
1266
1268 """
1269 This object represents several ``<input type=radio>`` elements
1270 that have the same name.
1271
1272 You can use this like a list, but also use the property
1273 ``.value`` to check/uncheck inputs. Also you can use
1274 ``.value_options`` to get the possible values.
1275 """
1276
1278 """
1279 Get/set the value, which checks the radio with that value (and
1280 unchecks any other value).
1281 """
1282 for el in self:
1283 if 'checked' in el.attrib:
1284 return el.get('value')
1285 return None
1286
1288 if value is not None:
1289 for el in self:
1290 if el.get('value') == value:
1291 checked_option = el
1292 break
1293 else:
1294 raise ValueError(
1295 "There is no radio input with the value %r" % value)
1296 for el in self:
1297 if 'checked' in el.attrib:
1298 del el.attrib['checked']
1299 if value is not None:
1300 checked_option.set('checked', '')
1301
1304
1305 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1306
1308 """
1309 Returns a list of all the possible values.
1310 """
1311 return [el.get('value') for el in self]
1312 value_options = property(value_options, doc=value_options.__doc__)
1313
1315 return '%s(%s)' % (
1316 self.__class__.__name__,
1317 list.__repr__(self))
1318
1320 """
1321 Represents a group of checkboxes (``<input type=checkbox>``) that
1322 have the same name.
1323
1324 In addition to using this like a list, the ``.value`` attribute
1325 returns a set-like object that you can add to or remove from to
1326 check and uncheck checkboxes. You can also use ``.value_options``
1327 to get the possible values.
1328 """
1329
1331 """
1332 Return a set-like object that can be modified to check or
1333 uncheck individual checkboxes according to their value.
1334 """
1335 return CheckboxValues(self)
1345 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1346
1348 """
1349 Returns a list of all the possible values.
1350 """
1351 return [el.get('value') for el in self]
1352 value_options = property(value_options, doc=value_options.__doc__)
1353
1355 return '%s(%s)' % (
1356 self.__class__.__name__, list.__repr__(self))
1357
1359
1360 """
1361 Represents the values of the checked checkboxes in a group of
1362 checkboxes with the same name.
1363 """
1364
1367
1369 return iter([
1370 el.get('value')
1371 for el in self.group
1372 if 'checked' in el.attrib])
1373
def add(self, value):
    """
    Check the first checkbox in the group whose ``value`` attribute
    equals ``value``.  Raises ``KeyError`` if the group has no such
    checkbox.
    """
    for checkbox in self.group:
        if checkbox.get('value') != value:
            continue
        checkbox.set('checked', '')
        return
    raise KeyError("No checkbox with value %r" % value)
1381
1383 for el in self.group:
1384 if el.get('value') == value:
1385 if 'checked' in el.attrib:
1386 del el.attrib['checked']
1387 else:
1388 raise KeyError(
1389 "The checkbox with value %r was already unchecked" % value)
1390 break
1391 else:
1392 raise KeyError(
1393 "No checkbox with value %r" % value)
1394
1396 return '<%s {%s} for checkboxes name=%r>' % (
1397 self.__class__.__name__,
1398 ', '.join([repr(v) for v in self]),
1399 self.group.name)
1400
1484
1485 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1486
1488 """
1489 Represents a ``<label>`` element.
1490
1491 Label elements are linked to other elements with their ``for``
1492 attribute. You can access this element with ``label.for_element``.
1493 """
1494
1496 """
1497 Get/set the element this label points to. Return None if it
1498 can't be found.
1499 """
1500 id = self.get('for')
1501 if not id:
1502 return None
1503 return self.body.get_element_by_id(id)
1505 id = other.get('id')
1506 if not id:
1507 raise TypeError(
1508 "Element %r has no id attribute" % other)
1509 self.set('for', id)
1513 for_element = property(_for_element__get, _for_element__set, _for_element__del,
1514 doc=_for_element__get.__doc__)
1515
1516 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1517
1518
1519
1520
1521
1535
1537 """Convert all tags in an XHTML tree to HTML by removing their
1538 XHTML namespace.
1539 """
1540 try:
1541 xhtml = xhtml.getroot()
1542 except AttributeError:
1543 pass
1544 prefix = "{%s}" % XHTML_NAMESPACE
1545 prefix_len = len(prefix)
1546 for el in xhtml.iter(prefix + "*"):
1547 el.tag = el.tag[prefix_len:]
1548
1549
1550
1551 __str_replace_meta_content_type = re.compile(
1552 r'<meta http-equiv="Content-Type"[^>]*>').sub
1553 __bytes_replace_meta_content_type = re.compile(
1554 r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1555
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print, encoding=encoding,
        with_tail=with_tail, doctype=doctype)

    # The Content-Type <meta> tag is only stripped from HTML output, and
    # only when the caller did not ask to keep it.
    if include_meta_content_type or method != 'html':
        return serialised

    # Use the substitution that matches the type of the serialised result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)
1628
1629 tostring.__doc__ = __fix_docstring(tostring.__doc__)
1630
1632 """
1633 Open the HTML document in a web browser, saving it to a temporary
1634 file to open it. Note that this does not delete the file after
1635 use. This is mainly meant for debugging.
1636 """
1637 import os
1638 import webbrowser
1639 import tempfile
1640 if not isinstance(doc, etree._ElementTree):
1641 doc = etree.ElementTree(doc)
1642 handle, fn = tempfile.mkstemp(suffix='.html')
1643 f = os.fdopen(handle, 'wb')
1644 try:
1645 doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
1646 finally:
1647
1648 f.close()
1649 url = 'file://' + fn.replace(os.path.sep, '/')
1650 print(url)
1651 webbrowser.open(url)
1652
1653
1654
1655
1656
1658 """An HTML parser that is configured to return lxml.html Element
1659 objects.
1660 """
1664
1666 """An XML parser that is configured to return lxml.html Element
1667 objects.
1668
1669 Note that this parser is not really XHTML aware unless you let it
1670 load a DTD that declares the HTML entities. To do this, make sure
1671 you have the XHTML DTDs installed in your catalogs, and create the
1672 parser like this::
1673
1674 >>> parser = XHTMLParser(load_dtd=True)
1675
1676 If you additionally want to validate the document, use this::
1677
1678 >>> parser = XHTMLParser(dtd_validation=True)
1679
1680 For catalog support, see http://www.xmlsoft.org/catalog.html.
1681 """
1685
1687 """Create a new HTML Element.
1688
1689 This can also be used for XHTML documents.
1690 """
1691 v = html_parser.makeelement(*args, **kw)
1692 return v
1693
1694 html_parser = HTMLParser()
1695 xhtml_parser = XHTMLParser()
1696