Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/html.py: 16%
366 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2:mod:`pandas.io.html` is a module containing functionality for dealing with
3HTML IO.
5"""
7from __future__ import annotations
9from collections import abc
10import numbers
11import re
12from typing import (
13 TYPE_CHECKING,
14 Iterable,
15 Literal,
16 Pattern,
17 Sequence,
18 cast,
19)
21from pandas._typing import (
22 FilePath,
23 ReadBuffer,
24)
25from pandas.compat._optional import import_optional_dependency
26from pandas.errors import (
27 AbstractMethodError,
28 EmptyDataError,
29)
30from pandas.util._decorators import deprecate_nonkeyword_arguments
32from pandas.core.dtypes.common import is_list_like
34from pandas import isna
35from pandas.core.construction import create_series_with_explicit_dtype
36from pandas.core.indexes.base import Index
37from pandas.core.indexes.multi import MultiIndex
39from pandas.io.common import (
40 file_exists,
41 get_handle,
42 is_url,
43 stringify_path,
44 urlopen,
45 validate_header_arg,
46)
47from pandas.io.formats.printing import pprint_thing
48from pandas.io.parsers import TextParser
50if TYPE_CHECKING: 50 ↛ 51line 50 didn't jump to line 51, because the condition on line 50 was never true
51 from pandas import DataFrame
53_IMPORTS = False
54_HAS_BS4 = False
55_HAS_LXML = False
56_HAS_HTML5LIB = False
59def _importers() -> None:
60 # import things we need
61 # but make this done on a first use basis
63 global _IMPORTS
64 if _IMPORTS:
65 return
67 global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB
68 bs4 = import_optional_dependency("bs4", errors="ignore")
69 _HAS_BS4 = bs4 is not None
71 lxml = import_optional_dependency("lxml.etree", errors="ignore")
72 _HAS_LXML = lxml is not None
74 html5lib = import_optional_dependency("html5lib", errors="ignore")
75 _HAS_HTML5LIB = html5lib is not None
77 _IMPORTS = True
80#############
81# READ HTML #
82#############
83_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}")
86def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str:
87 """
88 Replace extra whitespace inside of a string with a single space.
90 Parameters
91 ----------
92 s : str or unicode
93 The string from which to remove extra whitespace.
94 regex : re.Pattern
95 The regular expression to use to remove extra whitespace.
97 Returns
98 -------
99 subd : str or unicode
100 `s` with all extra whitespace replaced with a single space.
101 """
102 return regex.sub(" ", s.strip())
105def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]:
106 """
107 Get an iterator given an integer, slice or container.
109 Parameters
110 ----------
111 skiprows : int, slice, container
112 The iterator to use to skip rows; can also be a slice.
114 Raises
115 ------
116 TypeError
117 * If `skiprows` is not a slice, integer, or Container
119 Returns
120 -------
121 it : iterable
122 A proper iterator to use to skip rows of a DataFrame.
123 """
124 if isinstance(skiprows, slice):
125 start, step = skiprows.start or 0, skiprows.step or 1
126 return list(range(start, skiprows.stop, step))
127 elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
128 return cast("int | Sequence[int]", skiprows)
129 elif skiprows is None:
130 return 0
131 raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")
134def _read(
135 obj: bytes | FilePath | ReadBuffer[str] | ReadBuffer[bytes], encoding: str | None
136) -> str | bytes:
137 """
138 Try to read from a url, file or string.
140 Parameters
141 ----------
142 obj : str, unicode, path object, or file-like object
144 Returns
145 -------
146 raw_text : str
147 """
148 text: str | bytes
149 if (
150 is_url(obj)
151 or hasattr(obj, "read")
152 or (isinstance(obj, str) and file_exists(obj))
153 ):
154 # error: Argument 1 to "get_handle" has incompatible type "Union[str, bytes,
155 # Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]";
156 # expected "Union[PathLike[str], Union[str, Union[IO[Any], RawIOBase,
157 # BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]]"
158 with get_handle(
159 obj, "r", encoding=encoding # type: ignore[arg-type]
160 ) as handles:
161 text = handles.handle.read()
162 elif isinstance(obj, (str, bytes)):
163 text = obj
164 else:
165 raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
166 return text
class _HtmlFrameParser:
    """
    Base class for parsers that parse HTML into DataFrames.

    Parameters
    ----------
    io : str or file-like
        This can be either a string of raw HTML, a valid URL using the HTTP,
        FTP, or FILE protocols or a file-like object.

    match : str or regex
        The text to match in the document.

    attrs : dict
        List of HTML <table> element attributes to match.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Attributes
    ----------
    io : str or file-like
        raw HTML, URL, or file-like object

    match : regex
        The text to match in the raw HTML

    attrs : dict-like
        A dictionary of valid table attributes to use to search for table
        elements.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Notes
    -----
    To subclass this class effectively you must override the following methods:
        * :func:`_build_doc`
        * :func:`_attr_getter`
        * :func:`_href_getter`
        * :func:`_text_getter`
        * :func:`_parse_td`
        * :func:`_parse_thead_tr`
        * :func:`_parse_tbody_tr`
        * :func:`_parse_tfoot_tr`
        * :func:`_parse_tables`
        * :func:`_equals_tag`

    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        io: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
        match: str | Pattern,
        attrs: dict[str, str] | None,
        encoding: str,
        displayed_only: bool,
        extract_links: Literal[None, "header", "footer", "body", "all"],
    ) -> None:
        self.io = io
        self.match = match
        self.attrs = attrs
        self.encoding = encoding
        self.displayed_only = displayed_only
        self.extract_links = extract_links

    def parse_tables(self):
        """
        Parse and return all tables from the DOM.

        Returns
        -------
        list of parsed (header, body, footer) tuples from tables.
        """
        # NOTE: returns a generator — each table is only converted to rows
        # when the caller iterates over the result.
        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
        return (self._parse_thead_tbody_tfoot(table) for table in tables)

    def _attr_getter(self, obj, attr):
        """
        Return the attribute value of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        attr : str or unicode
            The attribute, such as "colspan"

        Returns
        -------
        str or unicode
            The attribute value.
        """
        # Both lxml and BeautifulSoup have the same implementation:
        return obj.get(attr)

    def _href_getter(self, obj):
        """
        Return a href if the DOM node contains a child <a> or None.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        href : str or unicode
            The href from the <a> child of the DOM node.
        """
        raise AbstractMethodError(self)

    def _text_getter(self, obj):
        """
        Return the text of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        text : str or unicode
            The text from an individual DOM node.
        """
        raise AbstractMethodError(self)

    def _parse_td(self, obj):
        """
        Return the td elements from a row element.

        Parameters
        ----------
        obj : node-like
            A DOM <tr> node.

        Returns
        -------
        list of node-like
            These are the elements of each row, i.e., the columns.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tr(self, table):
        """
        Return the list of thead row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains zero or more thead elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tbody_tr(self, table):
        """
        Return the list of tbody row elements from the parsed table element.

        HTML5 table bodies consist of either 0 or more <tbody> elements (which
        only contain <tr> elements) or 0 or more <tr> elements. This method
        checks for both structures.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tfoot_tr(self, table):
        """
        Return the list of tfoot row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tables(self, doc, match, attrs):
        """
        Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : the DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError : `match` does not match any text in the document.

        Returns
        -------
        list of node-like
            HTML <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self)

    def _equals_tag(self, obj, tag):
        """
        Return whether an individual DOM node matches a tag

        Parameters
        ----------
        obj : node-like
            A DOM node.

        tag : str
            Tag name to be checked for equality.

        Returns
        -------
        boolean
            Whether `obj`'s tag name is `tag`
        """
        raise AbstractMethodError(self)

    def _build_doc(self):
        """
        Return a tree-like object that can be used to iterate over the DOM.

        Returns
        -------
        node-like
            The DOM from which to parse the table element.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tbody_tfoot(self, table_html):
        """
        Given a table, return parsed header, body, and foot.

        Parameters
        ----------
        table_html : node-like

        Returns
        -------
        tuple of (header, body, footer), each a list of list-of-text rows.

        Notes
        -----
        Header and body are lists-of-lists. Top level list is a list of
        rows. Each row is a list of str text.

        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
               header, body, and footer, otherwise:
               - Put all rows into body
               - Move rows from top of body to header only if
                 all elements inside row are <th>
               - Move rows from bottom of body to footer only if
                 all elements inside row are <th>
        """
        header_rows = self._parse_thead_tr(table_html)
        body_rows = self._parse_tbody_tr(table_html)
        footer_rows = self._parse_tfoot_tr(table_html)

        def row_is_all_th(row):
            # True when every cell of `row` is a <th> (i.e. a header-style row)
            return all(self._equals_tag(t, "th") for t in self._parse_td(row))

        if not header_rows:
            # The table has no <thead>. Move the top all-<th> rows from
            # body_rows to header_rows. (This is a common case because many
            # tables in the wild have no <thead> or <tfoot>
            while body_rows and row_is_all_th(body_rows[0]):
                header_rows.append(body_rows.pop(0))

        header = self._expand_colspan_rowspan(header_rows, section="header")
        body = self._expand_colspan_rowspan(body_rows, section="body")
        footer = self._expand_colspan_rowspan(footer_rows, section="footer")

        return header, body, footer

    def _expand_colspan_rowspan(
        self, rows, section: Literal["header", "footer", "body"]
    ):
        """
        Given a list of <tr>s, return a list of text rows.

        Parameters
        ----------
        rows : list of node-like
            List of <tr>s
        section : the section that the rows belong to (header, body or footer).

        Returns
        -------
        list of list
            Each returned row is a list of str text, or tuple (text, link)
            if extract_links is not None.

        Notes
        -----
        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
        to subsequent cells.
        """
        all_texts = []  # list of rows, each a list of str
        text: str | tuple
        # `remainder` carries cells spilling over from earlier rows via
        # rowspan>1; each entry records the column index the cell occupies,
        # its text, and how many more rows it must still cover.
        remainder: list[
            tuple[int, str | tuple, int]
        ] = []  # list of (index, text, nrows)

        for tr in rows:
            texts = []  # the output for this row
            next_remainder = []

            index = 0
            tds = self._parse_td(tr)
            for td in tds:
                # Append texts from previous rows with rowspan>1 that come
                # before this <td>
                while remainder and remainder[0][0] <= index:
                    prev_i, prev_text, prev_rowspan = remainder.pop(0)
                    texts.append(prev_text)
                    if prev_rowspan > 1:
                        next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
                    index += 1

                # Append the text from this <td>, colspan times
                text = _remove_whitespace(self._text_getter(td))
                if self.extract_links == "all" or self.extract_links == section:
                    # cell becomes a (text, href-or-None) tuple in this mode
                    href = self._href_getter(td)
                    text = (text, href)
                rowspan = int(self._attr_getter(td, "rowspan") or 1)
                colspan = int(self._attr_getter(td, "colspan") or 1)

                for _ in range(colspan):
                    texts.append(text)
                    if rowspan > 1:
                        next_remainder.append((index, text, rowspan - 1))
                    index += 1

            # Append texts from previous rows at the final position
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))

            all_texts.append(texts)
            remainder = next_remainder

        # Append rows that only appear because the previous row had non-1
        # rowspan
        while remainder:
            next_remainder = []
            texts = []
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
            all_texts.append(texts)
            remainder = next_remainder

        return all_texts

    def _handle_hidden_tables(self, tbl_list, attr_name):
        """
        Return list of tables, potentially removing hidden elements

        Parameters
        ----------
        tbl_list : list of node-like
            Type of list elements will vary depending upon parser used
        attr_name : str
            Name of the accessor for retrieving HTML attributes

        Returns
        -------
        list of node-like
            Return type matches `tbl_list`
        """
        if not self.displayed_only:
            return tbl_list

        # whitespace is stripped from the style value so "display: none"
        # and "display:none" both match
        return [
            x
            for x in tbl_list
            if "display:none"
            not in getattr(x, attr_name).get("style", "").replace(" ", "")
        ]
class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that drives BeautifulSoup with the html5lib
    tree builder.

    See Also
    --------
    pandas.io.html._HtmlFrameParser
    pandas.io.html._LxmlFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`pandas.io.html._HtmlFrameParser`.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        from bs4 import SoupStrainer

        # restrict searches to <table> elements
        self._strainer = SoupStrainer("table")

    def _parse_tables(self, doc, match, attrs):
        candidates = doc.find_all(self._strainer.name, attrs=attrs)

        if not candidates:
            raise ValueError("No tables found")

        candidates = self._handle_hidden_tables(candidates, "attrs")

        matched = []
        seen = set()
        for table in candidates:
            if self.displayed_only:
                # strip hidden descendants before matching against their text
                for hidden in table.find_all(style=re.compile(r"display:\s*none")):
                    hidden.decompose()

            if table not in seen and table.find(string=match) is not None:
                matched.append(table)
            seen.add(table)

        if not matched:
            raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
        return matched

    def _href_getter(self, obj) -> str | None:
        anchor = obj.find("a", href=True)
        return anchor["href"] if anchor else None

    def _text_getter(self, obj):
        return obj.text

    def _equals_tag(self, obj, tag):
        return obj.name == tag

    def _parse_td(self, row):
        # direct children only; `row` may be a <thead> standing in for a <tr>
        return row.find_all(("td", "th"), recursive=False)

    def _parse_thead_tr(self, table):
        return table.select("thead tr")

    def _parse_tbody_tr(self, table):
        # HTML spec: at most one of these two lists has content
        return table.select("tbody tr") + table.find_all("tr", recursive=False)

    def _parse_tfoot_tr(self, table):
        return table.select("tfoot tr")

    def _setup_build_doc(self):
        raw_text = _read(self.io, self.encoding)
        if not raw_text:
            raise ValueError(f"No text parsed from document: {self.io}")
        return raw_text

    def _build_doc(self):
        from bs4 import BeautifulSoup

        raw = self._setup_build_doc()
        if isinstance(raw, bytes) and self.encoding is not None:
            # decode here ourselves and tell bs4 not to sniff the encoding
            document = raw.decode(self.encoding)
            from_encoding = None
        else:
            document = raw
            from_encoding = self.encoding

        soup = BeautifulSoup(document, features="html5lib", from_encoding=from_encoding)

        # keep line breaks visible in the extracted cell text
        for br in soup.find_all("br"):
            br.replace_with("\n" + br.text)

        return soup
687def _build_xpath_expr(attrs) -> str:
688 """
689 Build an xpath expression to simulate bs4's ability to pass in kwargs to
690 search for attributes when using the lxml parser.
692 Parameters
693 ----------
694 attrs : dict
695 A dict of HTML attributes. These are NOT checked for validity.
697 Returns
698 -------
699 expr : unicode
700 An XPath expression that checks for the given HTML attributes.
701 """
702 # give class attribute as class_ because class is a python keyword
703 if "class_" in attrs:
704 attrs["class"] = attrs.pop("class_")
706 s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()])
707 return f"[{s}]"
# XPath namespace binding that enables EXSLT regular-expression functions
# (used as re:test in _parse_tables).
_re_namespace = {"re": "http://exslt.org/regular-expressions"}


class _LxmlFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses lxml under the hood.

    Warning
    -------
    This parser can only handle HTTP, FTP, and FILE urls.

    See Also
    --------
    _HtmlFrameParser
    _BeautifulSoupLxmlFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`_HtmlFrameParser`.
    """

    def _href_getter(self, obj) -> str | None:
        # first <a href=...> anywhere under this cell, or None
        href = obj.xpath(".//a/@href")
        return None if not href else href[0]

    def _text_getter(self, obj):
        return obj.text_content()

    def _parse_td(self, row):
        # Look for direct children only: the "row" element here may be a
        # <thead> or <tfoot> (see _parse_thead_tr).
        return row.xpath("./td|./th")

    def _parse_tables(self, doc, match, kwargs):
        pattern = match.pattern

        # 1. check all descendants for the given pattern and only search tables
        # 2. go up the tree until we find a table
        xpath_expr = f"//table//*[re:test(text(), {repr(pattern)})]/ancestor::table"

        # if any table attributes were given build an xpath expression to
        # search for them
        if kwargs:
            xpath_expr += _build_xpath_expr(kwargs)

        tables = doc.xpath(xpath_expr, namespaces=_re_namespace)

        tables = self._handle_hidden_tables(tables, "attrib")
        if self.displayed_only:
            for table in tables:
                # lxml utilizes XPATH 1.0 which does not have regex
                # support. As a result, we find all elements with a style
                # attribute and iterate them to check for display:none
                for elem in table.xpath(".//*[@style]"):
                    if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
                        elem.getparent().remove(elem)

        if not tables:
            raise ValueError(f"No tables found matching regex {repr(pattern)}")
        return tables

    def _equals_tag(self, obj, tag):
        return obj.tag == tag

    def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.etree import XMLSyntaxError
        from lxml.html import (
            HTMLParser,
            fromstring,
            parse,
        )

        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, OSError) as e:
            # if the input is a blob of html goop
            if not is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            # the no-exception path: ensure we actually got a parsed document
            if not hasattr(r, "text_content"):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)

        for br in r.xpath("*//br"):
            # keep line breaks visible in the extracted cell text
            br.tail = "\n" + (br.tail or "")

        return r

    def _parse_thead_tr(self, table):
        rows = []

        for thead in table.xpath(".//thead"):
            rows.extend(thead.xpath("./tr"))

            # HACK: lxml does not clean up the clearly-erroneous
            # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
            # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
            # children as though it's a <tr>.
            #
            # Better solution would be to use html5lib.
            elements_at_root = thead.xpath("./td|./th")
            if elements_at_root:
                rows.append(thead)

        return rows

    def _parse_tbody_tr(self, table):
        from_tbody = table.xpath(".//tbody//tr")
        from_root = table.xpath("./tr")
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.xpath(".//tfoot//tr")
859def _expand_elements(body):
860 data = [len(elem) for elem in body]
861 lens = create_series_with_explicit_dtype(data, dtype_if_empty=object)
862 lens_max = lens.max()
863 not_max = lens[lens != lens_max]
865 empty = [""]
866 for ind, length in not_max.items():
867 body[ind] += empty * (lens_max - length)
def _data_to_frame(**kwargs):
    """
    Convert one parsed (header, body, footer) table into a DataFrame.

    Pops ``data`` and ``header`` from ``kwargs``; everything else is
    forwarded to TextParser. Footer rows are appended to the body, and
    ragged rows are padded before parsing.
    """
    head, body, foot = kwargs.pop("data")
    header = kwargs.pop("header")
    kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])

    if head:
        body = head + body

        # Infer header when there is a <thead> or top <th>-only rows
        if header is None:
            header = (
                0
                if len(head) == 1
                # ignore all-empty-text rows
                else [i for i, row in enumerate(head) if any(text for text in row)]
            )

    if foot:
        body += foot

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    with TextParser(body, header=header, **kwargs) as tp:
        return tp.read()
# Mapping from user-facing "flavor" name to parser implementation.
# ``None`` (the default) resolves to lxml; "bs4" and "html5lib" are synonyms
# for the BeautifulSoup + html5lib backend.
_valid_parsers = {
    "lxml": _LxmlFrameParser,
    None: _LxmlFrameParser,
    "html5lib": _BeautifulSoupHtml5LibFrameParser,
    "bs4": _BeautifulSoupHtml5LibFrameParser,
}
def _parser_dispatch(flavor: str | None) -> type[_HtmlFrameParser]:
    """
    Resolve a flavor name to its parser class.

    Parameters
    ----------
    flavor : str or None
        The type of parser to use. This must be a valid backend.

    Returns
    -------
    cls : _HtmlFrameParser subclass
        The parser class for the requested flavor.

    Raises
    ------
    ValueError
        * If `flavor` is not a valid backend.
    ImportError
        * If you do not have the requested `flavor` installed.
    """
    known = list(_valid_parsers.keys())
    if flavor not in known:
        raise ValueError(
            f"{repr(flavor)} is not a valid flavor, valid flavors are {known}"
        )

    if flavor not in ("bs4", "html5lib"):
        if not _HAS_LXML:
            raise ImportError("lxml not found, please install it")
    else:
        if not _HAS_HTML5LIB:
            raise ImportError("html5lib not found, please install it")
        if not _HAS_BS4:
            raise ImportError("BeautifulSoup4 (bs4) not found, please install it")
        # Although we call this above, we want to raise here right before use.
        bs4 = import_optional_dependency("bs4")  # noqa:F841

    return _valid_parsers[flavor]
943def _print_as_set(s) -> str:
944 arg = ", ".join([pprint_thing(el) for el in s])
945 return f"{{{arg}}}"
def _validate_flavor(flavor):
    """
    Normalize the user-supplied ``flavor`` into a tuple of backend names.

    Parameters
    ----------
    flavor : None, str, or iterable of str
        ``None`` selects the default fallback chain ("lxml", "bs4").

    Returns
    -------
    tuple of str
        The validated flavors in the order they should be tried.

    Raises
    ------
    TypeError
        If an iterable ``flavor`` contains non-string elements.
    ValueError
        If ``flavor`` is an unsupported type, or none of the requested
        flavors is a known backend.
    """
    if flavor is None:
        flavor = "lxml", "bs4"
    elif isinstance(flavor, str):
        flavor = (flavor,)
    elif isinstance(flavor, abc.Iterable):
        if not all(isinstance(flav, str) for flav in flavor):
            raise TypeError(
                f"Object of type {repr(type(flavor).__name__)} "
                f"is not an iterable of strings"
            )
    else:
        # flavor cannot be a str here (the str case is handled above), so the
        # former ``repr(flavor) if isinstance(flavor, str) else str(flavor)``
        # conditional was dead code; str-formatting is the only reachable case.
        raise ValueError(f"{flavor} is not a valid flavor")

    flavor = tuple(flavor)
    valid_flavors = set(_valid_parsers)
    flavor_set = set(flavor)

    if not flavor_set & valid_flavors:
        raise ValueError(
            f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid "
            f"flavors are {_print_as_set(valid_flavors)}"
        )
    return flavor
def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs):
    """
    Parse ``io`` with each requested flavor in turn and return a list of
    DataFrames built from the first parser that succeeds.

    Re-raises the last parser's ValueError when every flavor fails, or
    raises immediately when ``io`` is a non-rewindable file object that a
    failed parser has already consumed.
    """
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links)

        try:
            tables = p.parse_tables()
        except ValueError as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, "seekable") and io.seekable():
                io.seek(0)
            elif hasattr(io, "seekable") and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError(
                    f"The flavor {flav} failed to parse your input. "
                    "Since you passed a non-rewindable file "
                    "object, we can't rewind it to try "
                    "another parser. Try read_html() with a different flavor."
                ) from caught

            retained = caught
        else:
            break
    else:
        # for/else: loop exhausted without break, i.e. every flavor failed
        assert retained is not None  # for mypy
        raise retained

    ret = []
    for table in tables:
        try:
            df = _data_to_frame(data=table, **kwargs)
            # Cast MultiIndex header to an Index of tuples when extracting header
            # links and replace nan with None (therefore can't use mi.to_flat_index()).
            # This maintains consistency of selection (e.g. df.columns.str[1])
            if extract_links in ("all", "header") and isinstance(
                df.columns, MultiIndex
            ):
                df.columns = Index(
                    ((col[0], None if isna(col[1]) else col[1]) for col in df.columns),
                    tupleize_cols=False,
                )

            ret.append(df)
        except EmptyDataError:  # empty table
            continue
    return ret
@deprecate_nonkeyword_arguments(version="2.0")
def read_html(
    io: FilePath | ReadBuffer[str],
    match: str | Pattern = ".+",
    flavor: str | None = None,
    header: int | Sequence[int] | None = None,
    index_col: int | Sequence[int] | None = None,
    skiprows: int | Sequence[int] | slice | None = None,
    attrs: dict[str, str] | None = None,
    parse_dates: bool = False,
    thousands: str | None = ",",
    encoding: str | None = None,
    decimal: str = ".",
    converters: dict | None = None,
    na_values: Iterable[object] | None = None,
    keep_default_na: bool = True,
    displayed_only: bool = True,
    extract_links: Literal[None, "header", "footer", "body", "all"] = None,
) -> list[DataFrame]:
    r"""
    Read HTML tables into a ``list`` of ``DataFrame`` objects.

    Parameters
    ----------
    io : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a string ``read()`` function.
        The string can represent a URL or the HTML itself. Note that
        lxml only accepts the http, ftp and file url protocols. If you have a
        URL that starts with ``'https'`` you might try removing the ``'s'``.

    match : str or compiled regular expression, optional
        The set of tables containing text matching this regex or string will be
        returned. Unless the HTML is extremely simple you will probably need to
        pass a non-empty string here. Defaults to '.+' (match any non-empty
        string). The default value will return all tables contained on a page.
        This value is converted to a regular expression so that there is
        consistent behavior between Beautiful Soup and lxml.

    flavor : str, optional
        The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
        each other, they are both there for backwards compatibility. The
        default of ``None`` tries to use ``lxml`` to parse and if that fails it
        falls back on ``bs4`` + ``html5lib``.

    header : int or list-like, optional
        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
        make the columns headers.

    index_col : int or list-like, optional
        The column (or list of columns) to use to create the index.

    skiprows : int, list-like or slice, optional
        Number of rows to skip after parsing the column integer. 0-based. If a
        sequence of integers or a slice is given, will skip the rows indexed by
        that sequence. Note that a single element sequence means 'skip the nth
        row' whereas an integer means 'skip n rows'.

    attrs : dict, optional
        This is a dictionary of attributes that you can pass to use to identify
        the table in the HTML. These are not checked for validity before being
        passed to lxml or Beautiful Soup. However, these attributes must be
        valid HTML table attributes to work correctly. For example, ::

            attrs = {'id': 'table'}

        is a valid attribute dictionary because the 'id' HTML tag attribute is
        a valid HTML attribute for *any* HTML tag as per `this document
        <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. ::

            attrs = {'asdf': 'table'}

        is *not* a valid attribute dictionary because 'asdf' is not a valid
        HTML attribute even if it is a valid XML attribute.  Valid HTML 4.01
        table attributes can be found `here
        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
        working draft of the HTML 5 spec can be found `here
        <https://html.spec.whatwg.org/multipage/tables.html>`__. It contains the
        latest information on table attributes for the modern web.

    parse_dates : bool, optional
        See :func:`~read_csv` for more details.

    thousands : str, optional
        Separator to use to parse thousands. Defaults to ``','``.

    encoding : str, optional
        The encoding used to decode the web page. Defaults to ``None``.``None``
        preserves the previous encoding behavior, which depends on the
        underlying parser library (e.g., the parser library will try to use
        the encoding provided by the document).

    decimal : str, default '.'
        Character to recognize as decimal point (e.g. use ',' for European
        data).

    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.

    na_values : iterable, default None
        Custom NA values.

    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to.

    displayed_only : bool, default True
        Whether elements with "display: none" should be parsed.

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Returns
    -------
    dfs
        A list of DataFrames.

    See Also
    --------
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Notes
    -----
    Before using this function you should read the :ref:`gotchas about the
    HTML parsing libraries <io.html.gotchas>`.

    Expect to do some cleanup after you call this function. For example, you
    might need to manually assign column names if the column names are
    converted to NaN when you pass the `header=0` argument. We try to assume as
    little as possible about the structure of the table and push the
    idiosyncrasies of the HTML contained in the table to the user.

    This function searches for ``<table>`` elements and only for ``<tr>``
    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
    element in the table. ``<td>`` stands for "table data". This function
    attempts to properly handle ``colspan`` and ``rowspan`` attributes.
    If the function has a ``<thead>`` argument, it is used to construct
    the header, otherwise the function attempts to find the header within
    the body (by putting rows with only ``<th>`` elements into the header).

    Similar to :func:`~read_csv` the `header` argument is applied
    **after** `skiprows` is applied.

    This function will *always* return a list of :class:`DataFrame` *or*
    it will fail, e.g., it will *not* return an empty list.

    Examples
    --------
    See the :ref:`read_html documentation in the IO section of the docs
    <io.read_html>` for some examples of reading in HTML tables.
    """
    # make sure the optional backend availability flags are populated
    _importers()

    # Type check here. We don't want to parse only to fail because of an
    # invalid value of an integer skiprows.
    if isinstance(skiprows, numbers.Integral) and skiprows < 0:
        raise ValueError(
            "cannot skip rows starting from the end of the "
            "data (you passed a negative value)"
        )
    if extract_links not in [None, "header", "footer", "body", "all"]:
        raise ValueError(
            "`extract_links` must be one of "
            '{None, "header", "footer", "body", "all"}, got '
            f'"{extract_links}"'
        )
    validate_header_arg(header)

    # normalize os.PathLike input to a plain string for the parsers
    io = stringify_path(io)

    return _parse(
        flavor=flavor,
        io=io,
        match=match,
        header=header,
        index_col=index_col,
        skiprows=skiprows,
        parse_dates=parse_dates,
        thousands=thousands,
        attrs=attrs,
        encoding=encoding,
        decimal=decimal,
        converters=converters,
        na_values=na_values,
        keep_default_na=keep_default_na,
        displayed_only=displayed_only,
        extract_links=extract_links,
    )