Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/html.py: 16%
366 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2:mod:`pandas.io.html` is a module containing functionality for dealing with
3HTML IO.
5"""
7from __future__ import annotations
9from collections import abc
10import numbers
11import re
12from typing import (
13 TYPE_CHECKING,
14 Iterable,
15 Literal,
16 Pattern,
17 Sequence,
18 cast,
19)
21from pandas._typing import (
22 FilePath,
23 ReadBuffer,
24)
25from pandas.compat._optional import import_optional_dependency
26from pandas.errors import (
27 AbstractMethodError,
28 EmptyDataError,
29)
30from pandas.util._decorators import deprecate_nonkeyword_arguments
32from pandas.core.dtypes.common import is_list_like
34from pandas import isna
35from pandas.core.construction import create_series_with_explicit_dtype
36from pandas.core.indexes.base import Index
37from pandas.core.indexes.multi import MultiIndex
39from pandas.io.common import (
40 file_exists,
41 get_handle,
42 is_url,
43 stringify_path,
44 urlopen,
45 validate_header_arg,
46)
47from pandas.io.formats.printing import pprint_thing
48from pandas.io.parsers import TextParser
50if TYPE_CHECKING: 50 ↛ 51line 50 didn't jump to line 51, because the condition on line 50 was never true
51 from pandas import DataFrame
53_IMPORTS = False
54_HAS_BS4 = False
55_HAS_LXML = False
56_HAS_HTML5LIB = False
59def _importers() -> None:
60 # import things we need
61 # but make this done on a first use basis
63 global _IMPORTS
64 if _IMPORTS:
65 return
67 global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB
68 bs4 = import_optional_dependency("bs4", errors="ignore")
69 _HAS_BS4 = bs4 is not None
71 lxml = import_optional_dependency("lxml.etree", errors="ignore")
72 _HAS_LXML = lxml is not None
74 html5lib = import_optional_dependency("html5lib", errors="ignore")
75 _HAS_HTML5LIB = html5lib is not None
77 _IMPORTS = True
80#############
81# READ HTML #
82#############
83_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}")
86def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str:
87 """
88 Replace extra whitespace inside of a string with a single space.
90 Parameters
91 ----------
92 s : str or unicode
93 The string from which to remove extra whitespace.
94 regex : re.Pattern
95 The regular expression to use to remove extra whitespace.
97 Returns
98 -------
99 subd : str or unicode
100 `s` with all extra whitespace replaced with a single space.
101 """
102 return regex.sub(" ", s.strip())
105def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]:
106 """
107 Get an iterator given an integer, slice or container.
109 Parameters
110 ----------
111 skiprows : int, slice, container
112 The iterator to use to skip rows; can also be a slice.
114 Raises
115 ------
116 TypeError
117 * If `skiprows` is not a slice, integer, or Container
119 Returns
120 -------
121 it : iterable
122 A proper iterator to use to skip rows of a DataFrame.
123 """
124 if isinstance(skiprows, slice):
125 start, step = skiprows.start or 0, skiprows.step or 1
126 return list(range(start, skiprows.stop, step))
127 elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
128 return cast("int | Sequence[int]", skiprows)
129 elif skiprows is None:
130 return 0
131 raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")
134def _read(
135 obj: bytes | FilePath | ReadBuffer[str] | ReadBuffer[bytes], encoding: str | None
136) -> str | bytes:
137 """
138 Try to read from a url, file or string.
140 Parameters
141 ----------
142 obj : str, unicode, path object, or file-like object
144 Returns
145 -------
146 raw_text : str
147 """
148 text: str | bytes
149 if (
150 is_url(obj)
151 or hasattr(obj, "read")
152 or (isinstance(obj, str) and file_exists(obj))
153 ):
154 # error: Argument 1 to "get_handle" has incompatible type "Union[str, bytes,
155 # Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]";
156 # expected "Union[PathLike[str], Union[str, Union[IO[Any], RawIOBase,
157 # BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]]"
158 with get_handle(
159 obj, "r", encoding=encoding # type: ignore[arg-type]
160 ) as handles:
161 text = handles.handle.read()
162 elif isinstance(obj, (str, bytes)):
163 text = obj
164 else:
165 raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
166 return text
class _HtmlFrameParser:
    """
    Base class for parsers that parse HTML into DataFrames.

    Parameters
    ----------
    io : str or file-like
        This can be either a string of raw HTML, a valid URL using the HTTP,
        FTP, or FILE protocols or a file-like object.

    match : str or regex
        The text to match in the document.

    attrs : dict
        List of HTML <table> element attributes to match.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Attributes
    ----------
    io : str or file-like
        raw HTML, URL, or file-like object

    match : regex
        The text to match in the raw HTML

    attrs : dict-like
        A dictionary of valid table attributes to use to search for table
        elements.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Notes
    -----
    To subclass this class effectively you must override the following methods:
        * :func:`_build_doc`
        * :func:`_attr_getter`
        * :func:`_href_getter`
        * :func:`_text_getter`
        * :func:`_parse_td`
        * :func:`_parse_thead_tr`
        * :func:`_parse_tbody_tr`
        * :func:`_parse_tfoot_tr`
        * :func:`_parse_tables`
        * :func:`_equals_tag`

    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        io: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
        match: str | Pattern,
        attrs: dict[str, str] | None,
        encoding: str,
        displayed_only: bool,
        extract_links: Literal[None, "header", "footer", "body", "all"],
    ) -> None:
        self.io = io
        self.match = match
        self.attrs = attrs
        self.encoding = encoding
        self.displayed_only = displayed_only
        self.extract_links = extract_links

    def parse_tables(self):
        """
        Parse and return all tables from the DOM.

        Returns
        -------
        list of parsed (header, body, footer) tuples from tables.
        """
        # NOTE: returns a generator — each table is only converted to rows
        # when the caller iterates over the result.
        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
        return (self._parse_thead_tbody_tfoot(table) for table in tables)

    def _attr_getter(self, obj, attr):
        """
        Return the attribute value of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        attr : str or unicode
            The attribute, such as "colspan"

        Returns
        -------
        str or unicode
            The attribute value.
        """
        # Both lxml and BeautifulSoup have the same implementation:
        return obj.get(attr)

    def _href_getter(self, obj):
        """
        Return a href if the DOM node contains a child <a> or None.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        href : str or unicode
            The href from the <a> child of the DOM node.
        """
        raise AbstractMethodError(self)

    def _text_getter(self, obj):
        """
        Return the text of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        text : str or unicode
            The text from an individual DOM node.
        """
        raise AbstractMethodError(self)

    def _parse_td(self, obj):
        """
        Return the td elements from a row element.

        Parameters
        ----------
        obj : node-like
            A DOM <tr> node.

        Returns
        -------
        list of node-like
            These are the elements of each row, i.e., the columns.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tr(self, table):
        """
        Return the list of thead row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains zero or more thead elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tbody_tr(self, table):
        """
        Return the list of tbody row elements from the parsed table element.

        HTML5 table bodies consist of either 0 or more <tbody> elements (which
        only contain <tr> elements) or 0 or more <tr> elements. This method
        checks for both structures.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tfoot_tr(self, table):
        """
        Return the list of tfoot row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tables(self, doc, match, attrs):
        """
        Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : the DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError : `match` does not match any text in the document.

        Returns
        -------
        list of node-like
            HTML <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self)

    def _equals_tag(self, obj, tag):
        """
        Return whether an individual DOM node matches a tag

        Parameters
        ----------
        obj : node-like
            A DOM node.

        tag : str
            Tag name to be checked for equality.

        Returns
        -------
        boolean
            Whether `obj`'s tag name is `tag`
        """
        raise AbstractMethodError(self)

    def _build_doc(self):
        """
        Return a tree-like object that can be used to iterate over the DOM.

        Returns
        -------
        node-like
            The DOM from which to parse the table element.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tbody_tfoot(self, table_html):
        """
        Given a table, return parsed header, body, and foot.

        Parameters
        ----------
        table_html : node-like

        Returns
        -------
        tuple of (header, body, footer), each a list of list-of-text rows.

        Notes
        -----
        Header and body are lists-of-lists. Top level list is a list of
        rows. Each row is a list of str text.

        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
               header, body, and footer, otherwise:
               - Put all rows into body
               - Move rows from top of body to header only if
                 all elements inside row are <th>
               - Move rows from bottom of body to footer only if
                 all elements inside row are <th>
        """
        header_rows = self._parse_thead_tr(table_html)
        body_rows = self._parse_tbody_tr(table_html)
        footer_rows = self._parse_tfoot_tr(table_html)

        def row_is_all_th(row):
            # True when every cell of `row` is a <th> (i.e. a header-style row)
            return all(self._equals_tag(t, "th") for t in self._parse_td(row))

        if not header_rows:
            # The table has no <thead>. Move the top all-<th> rows from
            # body_rows to header_rows. (This is a common case because many
            # tables in the wild have no <thead> or <tfoot>
            while body_rows and row_is_all_th(body_rows[0]):
                header_rows.append(body_rows.pop(0))

        header = self._expand_colspan_rowspan(header_rows, section="header")
        body = self._expand_colspan_rowspan(body_rows, section="body")
        footer = self._expand_colspan_rowspan(footer_rows, section="footer")

        return header, body, footer

    def _expand_colspan_rowspan(
        self, rows, section: Literal["header", "footer", "body"]
    ):
        """
        Given a list of <tr>s, return a list of text rows.

        Parameters
        ----------
        rows : list of node-like
            List of <tr>s
        section : the section that the rows belong to (header, body or footer).

        Returns
        -------
        list of list
            Each returned row is a list of str text, or tuple (text, link)
            if extract_links is not None.

        Notes
        -----
        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
        to subsequent cells.
        """
        all_texts = []  # list of rows, each a list of str
        text: str | tuple
        # `remainder` carries cells spilling over from earlier rows via
        # rowspan>1; each entry records the column index the cell occupies,
        # its text, and how many more rows it must still cover.
        remainder: list[
            tuple[int, str | tuple, int]
        ] = []  # list of (index, text, nrows)

        for tr in rows:
            texts = []  # the output for this row
            next_remainder = []

            index = 0
            tds = self._parse_td(tr)
            for td in tds:
                # Append texts from previous rows with rowspan>1 that come
                # before this <td>
                while remainder and remainder[0][0] <= index:
                    prev_i, prev_text, prev_rowspan = remainder.pop(0)
                    texts.append(prev_text)
                    if prev_rowspan > 1:
                        next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
                    index += 1

                # Append the text from this <td>, colspan times
                text = _remove_whitespace(self._text_getter(td))
                if self.extract_links == "all" or self.extract_links == section:
                    # cell becomes a (text, href-or-None) tuple in this mode
                    href = self._href_getter(td)
                    text = (text, href)
                rowspan = int(self._attr_getter(td, "rowspan") or 1)
                colspan = int(self._attr_getter(td, "colspan") or 1)

                for _ in range(colspan):
                    texts.append(text)
                    if rowspan > 1:
                        next_remainder.append((index, text, rowspan - 1))
                    index += 1

            # Append texts from previous rows at the final position
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))

            all_texts.append(texts)
            remainder = next_remainder

        # Append rows that only appear because the previous row had non-1
        # rowspan
        while remainder:
            next_remainder = []
            texts = []
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
            all_texts.append(texts)
            remainder = next_remainder

        return all_texts

    def _handle_hidden_tables(self, tbl_list, attr_name):
        """
        Return list of tables, potentially removing hidden elements

        Parameters
        ----------
        tbl_list : list of node-like
            Type of list elements will vary depending upon parser used
        attr_name : str
            Name of the accessor for retrieving HTML attributes

        Returns
        -------
        list of node-like
            Return type matches `tbl_list`
        """
        if not self.displayed_only:
            return tbl_list

        # whitespace is stripped from the style value so "display: none"
        # and "display:none" both match
        return [
            x
            for x in tbl_list
            if "display:none"
            not in getattr(x, attr_name).get("style", "").replace(" ", "")
        ]
class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that drives BeautifulSoup with the html5lib
    tree builder.

    See Also
    --------
    pandas.io.html._HtmlFrameParser
    pandas.io.html._LxmlFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`pandas.io.html._HtmlFrameParser`.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        from bs4 import SoupStrainer

        # restrict searches to <table> elements
        self._strainer = SoupStrainer("table")

    def _parse_tables(self, doc, match, attrs):
        candidates = doc.find_all(self._strainer.name, attrs=attrs)

        if not candidates:
            raise ValueError("No tables found")

        candidates = self._handle_hidden_tables(candidates, "attrs")

        matched = []
        seen = set()
        for table in candidates:
            if self.displayed_only:
                # strip hidden descendants before matching against their text
                for hidden in table.find_all(style=re.compile(r"display:\s*none")):
                    hidden.decompose()

            if table not in seen and table.find(string=match) is not None:
                matched.append(table)
            seen.add(table)

        if not matched:
            raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
        return matched

    def _href_getter(self, obj) -> str | None:
        anchor = obj.find("a", href=True)
        return anchor["href"] if anchor else None

    def _text_getter(self, obj):
        return obj.text

    def _equals_tag(self, obj, tag):
        return obj.name == tag

    def _parse_td(self, row):
        # direct children only; `row` may be a <thead> standing in for a <tr>
        return row.find_all(("td", "th"), recursive=False)

    def _parse_thead_tr(self, table):
        return table.select("thead tr")

    def _parse_tbody_tr(self, table):
        # HTML spec: at most one of these two lists has content
        return table.select("tbody tr") + table.find_all("tr", recursive=False)

    def _parse_tfoot_tr(self, table):
        return table.select("tfoot tr")

    def _setup_build_doc(self):
        raw_text = _read(self.io, self.encoding)
        if not raw_text:
            raise ValueError(f"No text parsed from document: {self.io}")
        return raw_text

    def _build_doc(self):
        from bs4 import BeautifulSoup

        raw = self._setup_build_doc()
        if isinstance(raw, bytes) and self.encoding is not None:
            # decode here ourselves and tell bs4 not to sniff the encoding
            document = raw.decode(self.encoding)
            from_encoding = None
        else:
            document = raw
            from_encoding = self.encoding

        soup = BeautifulSoup(document, features="html5lib", from_encoding=from_encoding)

        # keep line breaks visible in the extracted cell text
        for br in soup.find_all("br"):
            br.replace_with("\n" + br.text)

        return soup
687def _build_xpath_expr(attrs) -> str:
688 """
689 Build an xpath expression to simulate bs4's ability to pass in kwargs to
690 search for attributes when using the lxml parser.
692 Parameters
693 ----------
694 attrs : dict
695 A dict of HTML attributes. These are NOT checked for validity.
697 Returns
698 -------
699 expr : unicode
700 An XPath expression that checks for the given HTML attributes.
701 """
702 # give class attribute as class_ because class is a python keyword
703 if "class_" in attrs:
704 attrs["class"] = attrs.pop("class_")
706 s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()])
707 return f"[{s}]"
# XPath namespace binding that enables EXSLT regular-expression functions
# (used as re:test in _parse_tables).
_re_namespace = {"re": "http://exslt.org/regular-expressions"}


class _LxmlFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses lxml under the hood.

    Warning
    -------
    This parser can only handle HTTP, FTP, and FILE urls.

    See Also
    --------
    _HtmlFrameParser
    _BeautifulSoupLxmlFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`_HtmlFrameParser`.
    """

    def _href_getter(self, obj) -> str | None:
        # first <a href=...> anywhere under this cell, or None
        href = obj.xpath(".//a/@href")
        return None if not href else href[0]

    def _text_getter(self, obj):
        return obj.text_content()

    def _parse_td(self, row):
        # Look for direct children only: the "row" element here may be a
        # <thead> or <tfoot> (see _parse_thead_tr).
        return row.xpath("./td|./th")

    def _parse_tables(self, doc, match, kwargs):
        pattern = match.pattern

        # 1. check all descendants for the given pattern and only search tables
        # 2. go up the tree until we find a table
        xpath_expr = f"//table//*[re:test(text(), {repr(pattern)})]/ancestor::table"

        # if any table attributes were given build an xpath expression to
        # search for them
        if kwargs:
            xpath_expr += _build_xpath_expr(kwargs)

        tables = doc.xpath(xpath_expr, namespaces=_re_namespace)

        tables = self._handle_hidden_tables(tables, "attrib")
        if self.displayed_only:
            for table in tables:
                # lxml utilizes XPATH 1.0 which does not have regex
                # support. As a result, we find all elements with a style
                # attribute and iterate them to check for display:none
                for elem in table.xpath(".//*[@style]"):
                    if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
                        elem.getparent().remove(elem)

        if not tables:
            raise ValueError(f"No tables found matching regex {repr(pattern)}")
        return tables

    def _equals_tag(self, obj, tag):
        return obj.tag == tag

    def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.etree import XMLSyntaxError
        from lxml.html import (
            HTMLParser,
            fromstring,
            parse,
        )

        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, OSError) as e:
            # if the input is a blob of html goop
            if not is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            # the no-exception path: ensure we actually got a parsed document
            if not hasattr(r, "text_content"):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)

        for br in r.xpath("*//br"):
            # keep line breaks visible in the extracted cell text
            br.tail = "\n" + (br.tail or "")

        return r

    def _parse_thead_tr(self, table):
        rows = []

        for thead in table.xpath(".//thead"):
            rows.extend(thead.xpath("./tr"))

            # HACK: lxml does not clean up the clearly-erroneous
            # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
            # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
            # children as though it's a <tr>.
            #
            # Better solution would be to use html5lib.
            elements_at_root = thead.xpath("./td|./th")
            if elements_at_root:
                rows.append(thead)

        return rows

    def _parse_tbody_tr(self, table):
        from_tbody = table.xpath(".//tbody//tr")
        from_root = table.xpath("./tr")
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.xpath(".//tfoot//tr")
859def _expand_elements(body):
860 data = [len(elem) for elem in body]
861 lens = create_series_with_explicit_dtype(data, dtype_if_empty=object)
862 lens_max = lens.max()
863 not_max = lens[lens != lens_max]
865 empty = [""]
866 for ind, length in not_max.items():
867 body[ind] += empty * (lens_max - length)
def _data_to_frame(**kwargs):
    """
    Convert one parsed (header, body, footer) table into a DataFrame.

    Pops ``data`` and ``header`` from ``kwargs``; everything else is
    forwarded to TextParser. Footer rows are appended to the body, and
    ragged rows are padded before parsing.
    """
    head, body, foot = kwargs.pop("data")
    header = kwargs.pop("header")
    kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])

    if head:
        body = head + body

        # Infer header when there is a <thead> or top <th>-only rows
        if header is None:
            header = (
                0
                if len(head) == 1
                # ignore all-empty-text rows
                else [i for i, row in enumerate(head) if any(text for text in row)]
            )

    if foot:
        body += foot

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    with TextParser(body, header=header, **kwargs) as tp:
        return tp.read()
# Mapping from user-facing "flavor" name to parser implementation.
# ``None`` (the default) resolves to lxml; "bs4" and "html5lib" are synonyms
# for the BeautifulSoup + html5lib backend.
_valid_parsers = {
    "lxml": _LxmlFrameParser,
    None: _LxmlFrameParser,
    "html5lib": _BeautifulSoupHtml5LibFrameParser,
    "bs4": _BeautifulSoupHtml5LibFrameParser,
}
def _parser_dispatch(flavor: str | None) -> type[_HtmlFrameParser]:
    """
    Resolve a flavor name to its parser class.

    Parameters
    ----------
    flavor : str or None
        The type of parser to use. This must be a valid backend.

    Returns
    -------
    cls : _HtmlFrameParser subclass
        The parser class for the requested flavor.

    Raises
    ------
    ValueError
        * If `flavor` is not a valid backend.
    ImportError
        * If you do not have the requested `flavor` installed.
    """
    known = list(_valid_parsers.keys())
    if flavor not in known:
        raise ValueError(
            f"{repr(flavor)} is not a valid flavor, valid flavors are {known}"
        )

    if flavor not in ("bs4", "html5lib"):
        if not _HAS_LXML:
            raise ImportError("lxml not found, please install it")
    else:
        if not _HAS_HTML5LIB:
            raise ImportError("html5lib not found, please install it")
        if not _HAS_BS4:
            raise ImportError("BeautifulSoup4 (bs4) not found, please install it")
        # Although we call this above, we want to raise here right before use.
        bs4 = import_optional_dependency("bs4")  # noqa:F841

    return _valid_parsers[flavor]
943def _print_as_set(s) -> str:
944 arg = ", ".join([pprint_thing(el) for el in s])
945 return f"{{{arg}}}"
def _validate_flavor(flavor):
    """
    Normalize the user-supplied ``flavor`` into a tuple of backend names.

    Parameters
    ----------
    flavor : None, str, or iterable of str
        ``None`` selects the default fallback chain ("lxml", "bs4").

    Returns
    -------
    tuple of str
        The validated flavors in the order they should be tried.

    Raises
    ------
    TypeError
        If an iterable ``flavor`` contains non-string elements.
    ValueError
        If ``flavor`` is an unsupported type, or none of the requested
        flavors is a known backend.
    """
    if flavor is None:
        flavor = "lxml", "bs4"
    elif isinstance(flavor, str):
        flavor = (flavor,)
    elif isinstance(flavor, abc.Iterable):
        if not all(isinstance(flav, str) for flav in flavor):
            raise TypeError(
                f"Object of type {repr(type(flavor).__name__)} "
                f"is not an iterable of strings"
            )
    else:
        # flavor cannot be a str here (the str case is handled above), so the
        # former ``repr(flavor) if isinstance(flavor, str) else str(flavor)``
        # conditional was dead code; str-formatting is the only reachable case.
        raise ValueError(f"{flavor} is not a valid flavor")

    flavor = tuple(flavor)
    valid_flavors = set(_valid_parsers)
    flavor_set = set(flavor)

    if not flavor_set & valid_flavors:
        raise ValueError(
            f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid "
            f"flavors are {_print_as_set(valid_flavors)}"
        )
    return flavor
def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs):
    """
    Parse ``io`` with each requested flavor in turn and return a list of
    DataFrames built from the first parser that succeeds.

    Re-raises the last parser's ValueError when every flavor fails, or
    raises immediately when ``io`` is a non-rewindable file object that a
    failed parser has already consumed.
    """
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links)

        try:
            tables = p.parse_tables()
        except ValueError as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, "seekable") and io.seekable():
                io.seek(0)
            elif hasattr(io, "seekable") and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError(
                    f"The flavor {flav} failed to parse your input. "
                    "Since you passed a non-rewindable file "
                    "object, we can't rewind it to try "
                    "another parser. Try read_html() with a different flavor."
                ) from caught

            retained = caught
        else:
            break
    else:
        # for/else: loop exhausted without break, i.e. every flavor failed
        assert retained is not None  # for mypy
        raise retained

    ret = []
    for table in tables:
        try:
            df = _data_to_frame(data=table, **kwargs)
            # Cast MultiIndex header to an Index of tuples when extracting header
            # links and replace nan with None (therefore can't use mi.to_flat_index()).
            # This maintains consistency of selection (e.g. df.columns.str[1])
            if extract_links in ("all", "header") and isinstance(
                df.columns, MultiIndex
            ):
                df.columns = Index(
                    ((col[0], None if isna(col[1]) else col[1]) for col in df.columns),
                    tupleize_cols=False,
                )

            ret.append(df)
        except EmptyDataError:  # empty table
            continue
    return ret
@deprecate_nonkeyword_arguments(version="2.0")
def read_html(
    io: FilePath | ReadBuffer[str],
    match: str | Pattern = ".+",
    flavor: str | None = None,
    header: int | Sequence[int] | None = None,
    index_col: int | Sequence[int] | None = None,
    skiprows: int | Sequence[int] | slice | None = None,
    attrs: dict[str, str] | None = None,
    parse_dates: bool = False,
    thousands: str | None = ",",
    encoding: str | None = None,
    decimal: str = ".",
    converters: dict | None = None,
    na_values: Iterable[object] | None = None,
    keep_default_na: bool = True,
    displayed_only: bool = True,
    extract_links: Literal[None, "header", "footer", "body", "all"] = None,
) -> list[DataFrame]:
    r"""
    Read HTML tables into a ``list`` of ``DataFrame`` objects.

    Parameters
    ----------
    io : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a string ``read()`` function.
        The string can represent a URL or the HTML itself. Note that
        lxml only accepts the http, ftp and file url protocols. If you have a
        URL that starts with ``'https'`` you might try removing the ``'s'``.

    match : str or compiled regular expression, optional
        The set of tables containing text matching this regex or string will be
        returned. Unless the HTML is extremely simple you will probably need to
        pass a non-empty string here. Defaults to '.+' (match any non-empty
        string). The default value will return all tables contained on a page.
        This value is converted to a regular expression so that there is
        consistent behavior between Beautiful Soup and lxml.

    flavor : str, optional
        The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
        each other, they are both there for backwards compatibility. The
        default of ``None`` tries to use ``lxml`` to parse and if that fails it
        falls back on ``bs4`` + ``html5lib``.

    header : int or list-like, optional
        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
        make the columns headers.

    index_col : int or list-like, optional
        The column (or list of columns) to use to create the index.

    skiprows : int, list-like or slice, optional
        Number of rows to skip after parsing the column integer. 0-based. If a
        sequence of integers or a slice is given, will skip the rows indexed by
        that sequence. Note that a single element sequence means 'skip the nth
        row' whereas an integer means 'skip n rows'.

    attrs : dict, optional
        This is a dictionary of attributes that you can pass to use to identify
        the table in the HTML. These are not checked for validity before being
        passed to lxml or Beautiful Soup. However, these attributes must be
        valid HTML table attributes to work correctly. For example, ::

            attrs = {'id': 'table'}

        is a valid attribute dictionary because the 'id' HTML tag attribute is
        a valid HTML attribute for *any* HTML tag as per `this document
        <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. ::

            attrs = {'asdf': 'table'}

        is *not* a valid attribute dictionary because 'asdf' is not a valid
        HTML attribute even if it is a valid XML attribute.  Valid HTML 4.01
        table attributes can be found `here
        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
        working draft of the HTML 5 spec can be found `here
        <https://html.spec.whatwg.org/multipage/tables.html>`__. It contains the
        latest information on table attributes for the modern web.

    parse_dates : bool, optional
        See :func:`~read_csv` for more details.

    thousands : str, optional
        Separator to use to parse thousands. Defaults to ``','``.

    encoding : str, optional
        The encoding used to decode the web page. Defaults to ``None``.``None``
        preserves the previous encoding behavior, which depends on the
        underlying parser library (e.g., the parser library will try to use
        the encoding provided by the document).

    decimal : str, default '.'
        Character to recognize as decimal point (e.g. use ',' for European
        data).

    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.

    na_values : iterable, default None
        Custom NA values.

    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to.

    displayed_only : bool, default True
        Whether elements with "display: none" should be parsed.

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Returns
    -------
    dfs
        A list of DataFrames.

    See Also
    --------
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Notes
    -----
    Before using this function you should read the :ref:`gotchas about the
    HTML parsing libraries <io.html.gotchas>`.

    Expect to do some cleanup after you call this function. For example, you
    might need to manually assign column names if the column names are
    converted to NaN when you pass the `header=0` argument. We try to assume as
    little as possible about the structure of the table and push the
    idiosyncrasies of the HTML contained in the table to the user.

    This function searches for ``<table>`` elements and only for ``<tr>``
    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
    element in the table. ``<td>`` stands for "table data". This function
    attempts to properly handle ``colspan`` and ``rowspan`` attributes.
    If the function has a ``<thead>`` argument, it is used to construct
    the header, otherwise the function attempts to find the header within
    the body (by putting rows with only ``<th>`` elements into the header).

    Similar to :func:`~read_csv` the `header` argument is applied
    **after** `skiprows` is applied.

    This function will *always* return a list of :class:`DataFrame` *or*
    it will fail, e.g., it will *not* return an empty list.

    Examples
    --------
    See the :ref:`read_html documentation in the IO section of the docs
    <io.read_html>` for some examples of reading in HTML tables.
    """
    # make sure the optional backend availability flags are populated
    _importers()

    # Type check here. We don't want to parse only to fail because of an
    # invalid value of an integer skiprows.
    if isinstance(skiprows, numbers.Integral) and skiprows < 0:
        raise ValueError(
            "cannot skip rows starting from the end of the "
            "data (you passed a negative value)"
        )
    if extract_links not in [None, "header", "footer", "body", "all"]:
        raise ValueError(
            "`extract_links` must be one of "
            '{None, "header", "footer", "body", "all"}, got '
            f'"{extract_links}"'
        )
    validate_header_arg(header)

    # normalize os.PathLike input to a plain string for the parsers
    io = stringify_path(io)

    return _parse(
        flavor=flavor,
        io=io,
        match=match,
        header=header,
        index_col=index_col,
        skiprows=skiprows,
        parse_dates=parse_dates,
        thousands=thousands,
        attrs=attrs,
        encoding=encoding,
        decimal=decimal,
        converters=converters,
        na_values=na_values,
        keep_default_na=keep_default_na,
        displayed_only=displayed_only,
        extract_links=extract_links,
    )