1""" 

2:mod:`pandas.io.html` is a module containing functionality for dealing with 

3HTML IO. 

4 

5""" 

6 

7from __future__ import annotations 

8 

9from collections import abc 

10import numbers 

11import re 

12from typing import ( 

13 TYPE_CHECKING, 

14 Iterable, 

15 Literal, 

16 Pattern, 

17 Sequence, 

18 cast, 

19) 

20 

21from pandas._typing import ( 

22 FilePath, 

23 ReadBuffer, 

24) 

25from pandas.compat._optional import import_optional_dependency 

26from pandas.errors import ( 

27 AbstractMethodError, 

28 EmptyDataError, 

29) 

30from pandas.util._decorators import deprecate_nonkeyword_arguments 

31 

32from pandas.core.dtypes.common import is_list_like 

33 

34from pandas import isna 

35from pandas.core.construction import create_series_with_explicit_dtype 

36from pandas.core.indexes.base import Index 

37from pandas.core.indexes.multi import MultiIndex 

38 

39from pandas.io.common import ( 

40 file_exists, 

41 get_handle, 

42 is_url, 

43 stringify_path, 

44 urlopen, 

45 validate_header_arg, 

46) 

47from pandas.io.formats.printing import pprint_thing 

48from pandas.io.parsers import TextParser 

49 

50if TYPE_CHECKING: 50 ↛ 51line 50 didn't jump to line 51, because the condition on line 50 was never true

51 from pandas import DataFrame 

52 

53_IMPORTS = False 

54_HAS_BS4 = False 

55_HAS_LXML = False 

56_HAS_HTML5LIB = False 

57 

58 

59def _importers() -> None: 

    # Import the optional parsing backends we need, but only on first use.

    global _IMPORTS
    if _IMPORTS:
        return

    global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB
    bs4 = import_optional_dependency("bs4", errors="ignore")
    _HAS_BS4 = bs4 is not None

    lxml = import_optional_dependency("lxml.etree", errors="ignore")
    _HAS_LXML = lxml is not None

    html5lib = import_optional_dependency("html5lib", errors="ignore")
    _HAS_HTML5LIB = html5lib is not None

    _IMPORTS = True


#############
# READ HTML #
#############
_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}")


def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str:
    """
    Replace extra whitespace inside of a string with a single space.

    Parameters
    ----------
    s : str or unicode
        The string from which to remove extra whitespace.
    regex : re.Pattern
        The regular expression to use to remove extra whitespace.

    Returns
    -------
    subd : str or unicode
        `s` with all extra whitespace replaced with a single space.
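
    Examples
    --------
    A minimal illustration; the result follows from ``_RE_WHITESPACE`` above:

    >>> _remove_whitespace("  Total   assets  ")
    'Total assets'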

101 """ 

102 return regex.sub(" ", s.strip()) 

103 

104 

105def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]: 

106 """ 

107 Get an iterator given an integer, slice or container. 

108 

109 Parameters 

110 ---------- 

111 skiprows : int, slice, container 

112 The iterator to use to skip rows; can also be a slice. 

113 

114 Raises 

115 ------ 

116 TypeError 

117 * If `skiprows` is not a slice, integer, or Container 

118 

119 Returns 

120 ------- 

121 it : iterable 

122 A proper iterator to use to skip rows of a DataFrame. 
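
    Examples
    --------
    Illustrative calls; a slice is expanded, other valid inputs pass through:

    >>> _get_skiprows(slice(0, 6, 2))
    [0, 2, 4]
    >>> _get_skiprows(3)
    3
    >>> _get_skiprows(None)
    0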

123 """ 

124 if isinstance(skiprows, slice): 

125 start, step = skiprows.start or 0, skiprows.step or 1 

126 return list(range(start, skiprows.stop, step)) 

127 elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows): 

128 return cast("int | Sequence[int]", skiprows) 

129 elif skiprows is None: 

130 return 0 

131 raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows") 

132 

133 

134def _read( 

135 obj: bytes | FilePath | ReadBuffer[str] | ReadBuffer[bytes], encoding: str | None 

136) -> str | bytes: 

137 """ 

138 Try to read from a url, file or string. 

139 

140 Parameters 

141 ---------- 

142 obj : str, unicode, path object, or file-like object 

143 

144 Returns 

145 ------- 

146 raw_text : str 
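
    Examples
    --------
    A raw-HTML string that is neither a URL nor an existing file path is
    returned unchanged:

    >>> _read("<table><tr><td>1</td></tr></table>", encoding=None)
    '<table><tr><td>1</td></tr></table>'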

147 """ 

148 text: str | bytes 

149 if ( 

150 is_url(obj) 

151 or hasattr(obj, "read") 

152 or (isinstance(obj, str) and file_exists(obj)) 

153 ): 

154 # error: Argument 1 to "get_handle" has incompatible type "Union[str, bytes, 

155 # Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]"; 

156 # expected "Union[PathLike[str], Union[str, Union[IO[Any], RawIOBase, 

157 # BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]]" 

158 with get_handle( 

159 obj, "r", encoding=encoding # type: ignore[arg-type] 

160 ) as handles: 

161 text = handles.handle.read() 

162 elif isinstance(obj, (str, bytes)): 

163 text = obj 

164 else: 

165 raise TypeError(f"Cannot read object of type '{type(obj).__name__}'") 

166 return text 

167 

168 

169class _HtmlFrameParser: 

170 """ 

171 Base class for parsers that parse HTML into DataFrames. 

172 

173 Parameters 

174 ---------- 

175 io : str or file-like 

176 This can be either a string of raw HTML, a valid URL using the HTTP, 

177 FTP, or FILE protocols or a file-like object. 

178 

179 match : str or regex 

180 The text to match in the document. 

181 

182 attrs : dict 

        Dictionary of HTML <table> element attributes to match.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Attributes
    ----------
    io : str or file-like
        raw HTML, URL, or file-like object

    match : regex
        The text to match in the raw HTML

    attrs : dict-like
        A dictionary of valid table attributes to use to search for table
        elements.

    encoding : str
        Encoding to be used by parser

    displayed_only : bool
        Whether or not items with "display:none" should be ignored

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Notes
    -----
    To subclass this class effectively you must override the following methods:
        * :func:`_build_doc`
        * :func:`_attr_getter`
        * :func:`_href_getter`
        * :func:`_text_getter`
        * :func:`_parse_td`
        * :func:`_parse_thead_tr`
        * :func:`_parse_tbody_tr`
        * :func:`_parse_tfoot_tr`
        * :func:`_parse_tables`
        * :func:`_equals_tag`
    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        io: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
        match: str | Pattern,
        attrs: dict[str, str] | None,
        encoding: str,
        displayed_only: bool,
        extract_links: Literal[None, "header", "footer", "body", "all"],
    ) -> None:
        self.io = io
        self.match = match
        self.attrs = attrs
        self.encoding = encoding
        self.displayed_only = displayed_only
        self.extract_links = extract_links

    def parse_tables(self):
        """
        Parse and return all tables from the DOM.

        Returns
        -------
        list of parsed (header, body, footer) tuples from tables.
        """
        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
        return (self._parse_thead_tbody_tfoot(table) for table in tables)

    def _attr_getter(self, obj, attr):
        """
        Return the attribute value of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        attr : str or unicode
            The attribute, such as "colspan"

        Returns
        -------
        str or unicode
            The attribute value.
        """
        # Both lxml and BeautifulSoup have the same implementation:
        return obj.get(attr)

    def _href_getter(self, obj):
        """
        Return a href if the DOM node contains a child <a> or None.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        href : str or unicode
            The href from the <a> child of the DOM node.
        """
        raise AbstractMethodError(self)

    def _text_getter(self, obj):
        """
        Return the text of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        text : str or unicode
            The text from an individual DOM node.
        """
        raise AbstractMethodError(self)

    def _parse_td(self, obj):
        """
        Return the td elements from a row element.

        Parameters
        ----------
        obj : node-like
            A DOM <tr> node.

        Returns
        -------
        list of node-like
            These are the elements of each row, i.e., the columns.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tr(self, table):
        """
        Return the list of thead row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains zero or more thead elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tbody_tr(self, table):
        """
        Return the list of tbody row elements from the parsed table element.

        HTML5 table bodies consist of either 0 or more <tbody> elements (which
        only contain <tr> elements) or 0 or more <tr> elements. This method
        checks for both structures.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tfoot_tr(self, table):
        """
        Return the list of tfoot row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tables(self, doc, match, attrs):
        """
        Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : the DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError : `match` does not match any text in the document.

        Returns
        -------
        list of node-like
            HTML <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self)

    def _equals_tag(self, obj, tag):
        """
        Return whether an individual DOM node matches a tag.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        tag : str
            Tag name to be checked for equality.

        Returns
        -------
        boolean
            Whether `obj`'s tag name is `tag`
        """
        raise AbstractMethodError(self)

    def _build_doc(self):
        """
        Return a tree-like object that can be used to iterate over the DOM.

        Returns
        -------
        node-like
            The DOM from which to parse the table element.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tbody_tfoot(self, table_html):
        """
        Given a table, return parsed header, body, and foot.

        Parameters
        ----------
        table_html : node-like

        Returns
        -------
        tuple of (header, body, footer), each a list of list-of-text rows.

        Notes
        -----
        Header and body are lists-of-lists. Top level list is a list of
        rows. Each row is a list of str text.

        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
               header, body, and footer, otherwise:
               - Put all rows into body
               - Move rows from top of body to header only if
                 all elements inside row are <th>
               - Move rows from bottom of body to footer only if
                 all elements inside row are <th>
        """
        header_rows = self._parse_thead_tr(table_html)
        body_rows = self._parse_tbody_tr(table_html)
        footer_rows = self._parse_tfoot_tr(table_html)

        def row_is_all_th(row):
            return all(self._equals_tag(t, "th") for t in self._parse_td(row))

        if not header_rows:
            # The table has no <thead>. Move the top all-<th> rows from
            # body_rows to header_rows. (This is a common case because many
            # tables in the wild have no <thead> or <tfoot>.)
            while body_rows and row_is_all_th(body_rows[0]):
                header_rows.append(body_rows.pop(0))

        header = self._expand_colspan_rowspan(header_rows, section="header")
        body = self._expand_colspan_rowspan(body_rows, section="body")
        footer = self._expand_colspan_rowspan(footer_rows, section="footer")

        return header, body, footer

    def _expand_colspan_rowspan(
        self, rows, section: Literal["header", "footer", "body"]
    ):
        """
        Given a list of <tr>s, return a list of text rows.

        Parameters
        ----------
        rows : list of node-like
            List of <tr>s
        section : the section that the rows belong to (header, body or footer).

        Returns
        -------
        list of list
            Each returned row is a list of str text, or tuple (text, link)
            if extract_links is not None.

        Notes
        -----
        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
        to subsequent cells.
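
        For example, a cell ``<td colspan="2">A</td>`` contributes the text
        ``"A"`` twice to its row, and a cell with ``rowspan`` is re-emitted at
        the same column index in each subsequent row it spans.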

505 """ 

506 all_texts = [] # list of rows, each a list of str 

507 text: str | tuple 

508 remainder: list[ 

509 tuple[int, str | tuple, int] 

510 ] = [] # list of (index, text, nrows) 

511 

512 for tr in rows: 

513 texts = [] # the output for this row 

514 next_remainder = [] 

515 

516 index = 0 

517 tds = self._parse_td(tr) 

518 for td in tds: 

519 # Append texts from previous rows with rowspan>1 that come 

520 # before this <td> 

521 while remainder and remainder[0][0] <= index: 

522 prev_i, prev_text, prev_rowspan = remainder.pop(0) 

523 texts.append(prev_text) 

524 if prev_rowspan > 1: 

525 next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) 

526 index += 1 

527 

528 # Append the text from this <td>, colspan times 

529 text = _remove_whitespace(self._text_getter(td)) 

530 if self.extract_links == "all" or self.extract_links == section: 

531 href = self._href_getter(td) 

532 text = (text, href) 

533 rowspan = int(self._attr_getter(td, "rowspan") or 1) 

534 colspan = int(self._attr_getter(td, "colspan") or 1) 

535 

536 for _ in range(colspan): 

537 texts.append(text) 

538 if rowspan > 1: 

539 next_remainder.append((index, text, rowspan - 1)) 

540 index += 1 

541 

542 # Append texts from previous rows at the final position 

543 for prev_i, prev_text, prev_rowspan in remainder: 

544 texts.append(prev_text) 

545 if prev_rowspan > 1: 

546 next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) 

547 

548 all_texts.append(texts) 

549 remainder = next_remainder 

550 

551 # Append rows that only appear because the previous row had non-1 

552 # rowspan 

553 while remainder: 

554 next_remainder = [] 

555 texts = [] 

556 for prev_i, prev_text, prev_rowspan in remainder: 

557 texts.append(prev_text) 

558 if prev_rowspan > 1: 

559 next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) 

560 all_texts.append(texts) 

561 remainder = next_remainder 

562 

563 return all_texts 

564 

565 def _handle_hidden_tables(self, tbl_list, attr_name): 

566 """ 

567 Return list of tables, potentially removing hidden elements 

568 

569 Parameters 

570 ---------- 

571 tbl_list : list of node-like 

572 Type of list elements will vary depending upon parser used 

573 attr_name : str 

574 Name of the accessor for retrieving HTML attributes 

575 

576 Returns 

577 ------- 

578 list of node-like 

579 Return type matches `tbl_list` 

580 """ 

581 if not self.displayed_only: 

582 return tbl_list 

583 

584 return [ 

585 x 

586 for x in tbl_list 

587 if "display:none" 

588 not in getattr(x, attr_name).get("style", "").replace(" ", "") 

589 ] 

590 

591 

592class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): 

593 """ 

594 HTML to DataFrame parser that uses BeautifulSoup under the hood. 

595 

596 See Also 

597 -------- 

598 pandas.io.html._HtmlFrameParser 

599 pandas.io.html._LxmlFrameParser 

600 

601 Notes 

602 ----- 

603 Documentation strings for this class are in the base class 

604 :class:`pandas.io.html._HtmlFrameParser`. 

605 """ 

606 

607 def __init__(self, *args, **kwargs) -> None: 

608 super().__init__(*args, **kwargs) 

609 from bs4 import SoupStrainer 

610 

611 self._strainer = SoupStrainer("table") 

612 

613 def _parse_tables(self, doc, match, attrs): 

614 element_name = self._strainer.name 

615 tables = doc.find_all(element_name, attrs=attrs) 

616 

617 if not tables: 

618 raise ValueError("No tables found") 

619 

620 result = [] 

621 unique_tables = set() 

622 tables = self._handle_hidden_tables(tables, "attrs") 

623 

624 for table in tables: 

625 if self.displayed_only: 

626 for elem in table.find_all(style=re.compile(r"display:\s*none")): 

627 elem.decompose() 

628 

629 if table not in unique_tables and table.find(string=match) is not None: 

630 result.append(table) 

631 unique_tables.add(table) 

632 

633 if not result: 

634 raise ValueError(f"No tables found matching pattern {repr(match.pattern)}") 

635 return result 

636 

637 def _href_getter(self, obj) -> str | None: 

638 a = obj.find("a", href=True) 

639 return None if not a else a["href"] 

640 

641 def _text_getter(self, obj): 

642 return obj.text 

643 

644 def _equals_tag(self, obj, tag): 

645 return obj.name == tag 

646 

647 def _parse_td(self, row): 

648 return row.find_all(("td", "th"), recursive=False) 

649 

650 def _parse_thead_tr(self, table): 

651 return table.select("thead tr") 

652 

653 def _parse_tbody_tr(self, table): 

654 from_tbody = table.select("tbody tr") 

655 from_root = table.find_all("tr", recursive=False) 

656 # HTML spec: at most one of these lists has content 

657 return from_tbody + from_root 

658 

659 def _parse_tfoot_tr(self, table): 

660 return table.select("tfoot tr") 

661 

662 def _setup_build_doc(self): 

663 raw_text = _read(self.io, self.encoding) 

664 if not raw_text: 

665 raise ValueError(f"No text parsed from document: {self.io}") 

666 return raw_text 

667 

668 def _build_doc(self): 

669 from bs4 import BeautifulSoup 

670 

671 bdoc = self._setup_build_doc() 

672 if isinstance(bdoc, bytes) and self.encoding is not None: 

673 udoc = bdoc.decode(self.encoding) 

674 from_encoding = None 

675 else: 

676 udoc = bdoc 

677 from_encoding = self.encoding 

678 

679 soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding) 

680 

681 for br in soup.find_all("br"): 

682 br.replace_with("\n" + br.text) 

683 

684 return soup 

685 

686 

687def _build_xpath_expr(attrs) -> str: 

688 """ 

689 Build an xpath expression to simulate bs4's ability to pass in kwargs to 

690 search for attributes when using the lxml parser. 

691 

692 Parameters 

693 ---------- 

694 attrs : dict 

695 A dict of HTML attributes. These are NOT checked for validity. 

696 

697 Returns 

698 ------- 

699 expr : unicode 

700 An XPath expression that checks for the given HTML attributes. 
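
    Examples
    --------
    A short illustration; note that ``class_`` is rewritten to ``class``:

    >>> _build_xpath_expr({"id": "main", "class_": "wide"})
    "[@id='main' and @class='wide']"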

701 """ 

702 # give class attribute as class_ because class is a python keyword 

703 if "class_" in attrs: 

704 attrs["class"] = attrs.pop("class_") 

705 

706 s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()]) 

707 return f"[{s}]" 

708 

709 

710_re_namespace = {"re": "http://exslt.org/regular-expressions"} 

711 

712 

713class _LxmlFrameParser(_HtmlFrameParser): 

714 """ 

715 HTML to DataFrame parser that uses lxml under the hood. 

716 

717 Warning 

718 ------- 

719 This parser can only handle HTTP, FTP, and FILE urls. 

720 

721 See Also 

722 -------- 

723 _HtmlFrameParser 

    _BeautifulSoupHtml5LibFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`_HtmlFrameParser`.
    """

    def _href_getter(self, obj) -> str | None:
        href = obj.xpath(".//a/@href")
        return None if not href else href[0]

    def _text_getter(self, obj):
        return obj.text_content()

    def _parse_td(self, row):
        # Look for direct children only: the "row" element here may be a
        # <thead> or <tfoot> (see _parse_thead_tr).
        return row.xpath("./td|./th")

    def _parse_tables(self, doc, match, kwargs):
        pattern = match.pattern

        # 1. check all descendants for the given pattern and only search tables
        # 2. go up the tree until we find a table
        xpath_expr = f"//table//*[re:test(text(), {repr(pattern)})]/ancestor::table"

        # if any table attributes were given build an xpath expression to
        # search for them
        if kwargs:
            xpath_expr += _build_xpath_expr(kwargs)

        tables = doc.xpath(xpath_expr, namespaces=_re_namespace)

        tables = self._handle_hidden_tables(tables, "attrib")
        if self.displayed_only:
            for table in tables:
                # lxml utilizes XPATH 1.0 which does not have regex
                # support. As a result, we find all elements with a style
                # attribute and iterate them to check for display:none
                for elem in table.xpath(".//*[@style]"):
                    if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
                        elem.getparent().remove(elem)

        if not tables:
            raise ValueError(f"No tables found matching regex {repr(pattern)}")
        return tables

    def _equals_tag(self, obj, tag):
        return obj.tag == tag

    def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.etree import XMLSyntaxError
        from lxml.html import (
            HTMLParser,
            fromstring,
            parse,
        )

        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, OSError) as e:
            # if the input is a blob of html goop
            if not is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            if not hasattr(r, "text_content"):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)

        for br in r.xpath("*//br"):
            br.tail = "\n" + (br.tail or "")

        return r

    def _parse_thead_tr(self, table):
        rows = []

        for thead in table.xpath(".//thead"):
            rows.extend(thead.xpath("./tr"))

            # HACK: lxml does not clean up the clearly-erroneous
            # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
            # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
            # children as though it's a <tr>.
            #
            # Better solution would be to use html5lib.
            elements_at_root = thead.xpath("./td|./th")
            if elements_at_root:
                rows.append(thead)

        return rows

    def _parse_tbody_tr(self, table):
        from_tbody = table.xpath(".//tbody//tr")
        from_root = table.xpath("./tr")
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.xpath(".//tfoot//tr")


def _expand_elements(body):
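    """
    Pad ragged rows of `body` in place with empty strings so that every row
    has as many elements as the longest row; e.g. (illustrative)
    ``[["a", "b"], ["c"]]`` becomes ``[["a", "b"], ["c", ""]]``.
    """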

    data = [len(elem) for elem in body]
    lens = create_series_with_explicit_dtype(data, dtype_if_empty=object)
    lens_max = lens.max()
    not_max = lens[lens != lens_max]

    empty = [""]
    for ind, length in not_max.items():
        body[ind] += empty * (lens_max - length)


def _data_to_frame(**kwargs):
    head, body, foot = kwargs.pop("data")
    header = kwargs.pop("header")
    kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])
    if head:
        body = head + body

    # Infer header when there is a <thead> or top <th>-only rows
    if header is None:
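        # e.g. (illustrative): head == [["a", "b"]] infers header=0, while a
        # two-row head infers header == [0, 1] (parsed as a MultiIndex),
        # skipping any all-empty rows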

        if len(head) == 1:
            header = 0
        else:
            # ignore all-empty-text rows
            header = [i for i, row in enumerate(head) if any(text for text in row)]

    if foot:
        body += foot

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    with TextParser(body, header=header, **kwargs) as tp:
        return tp.read()


_valid_parsers = {
    "lxml": _LxmlFrameParser,
    None: _LxmlFrameParser,
    "html5lib": _BeautifulSoupHtml5LibFrameParser,
    "bs4": _BeautifulSoupHtml5LibFrameParser,
}


def _parser_dispatch(flavor: str | None) -> type[_HtmlFrameParser]:
    """
    Choose the parser based on the input flavor.

    Parameters
    ----------
    flavor : str
        The type of parser to use. This must be a valid backend.

    Returns
    -------
    cls : _HtmlFrameParser subclass
        The parser class based on the requested input flavor.

    Raises
    ------
    ValueError
        * If `flavor` is not a valid backend.
    ImportError
        * If you do not have the requested `flavor`
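
    Examples
    --------
    Assuming lxml is installed and ``_importers()`` has already run:

    >>> _parser_dispatch("lxml")  # doctest: +SKIP
    <class 'pandas.io.html._LxmlFrameParser'>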

922 """ 

923 valid_parsers = list(_valid_parsers.keys()) 

924 if flavor not in valid_parsers: 

925 raise ValueError( 

926 f"{repr(flavor)} is not a valid flavor, valid flavors are {valid_parsers}" 

927 ) 

928 

929 if flavor in ("bs4", "html5lib"): 

930 if not _HAS_HTML5LIB: 

931 raise ImportError("html5lib not found, please install it") 

932 if not _HAS_BS4: 

933 raise ImportError("BeautifulSoup4 (bs4) not found, please install it") 

934 # Although we call this above, we want to raise here right before use. 

935 bs4 = import_optional_dependency("bs4") # noqa:F841 

936 

937 else: 

938 if not _HAS_LXML: 

939 raise ImportError("lxml not found, please install it") 

940 return _valid_parsers[flavor] 

941 

942 

943def _print_as_set(s) -> str: 
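    """
    Render an iterable as a set-style string; e.g. (illustrative)
    ``_print_as_set(["lxml", "bs4"])`` returns ``"{lxml, bs4}"``.
    """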

944 arg = ", ".join([pprint_thing(el) for el in s]) 

945 return f"{{{arg}}}" 

946 

947 

948def _validate_flavor(flavor): 
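    """
    Normalize `flavor` to a tuple of parser names; e.g. (illustrative)
    ``None`` becomes ``("lxml", "bs4")`` and ``"bs4"`` becomes ``("bs4",)``.
    Invalid names raise ValueError and non-string iterables raise TypeError.
    """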

    if flavor is None:
        flavor = "lxml", "bs4"
    elif isinstance(flavor, str):
        flavor = (flavor,)
    elif isinstance(flavor, abc.Iterable):
        if not all(isinstance(flav, str) for flav in flavor):
            raise TypeError(
                f"Object of type {repr(type(flavor).__name__)} "
                f"is not an iterable of strings"
            )
    else:
        msg = repr(flavor) if isinstance(flavor, str) else str(flavor)
        msg += " is not a valid flavor"
        raise ValueError(msg)

    flavor = tuple(flavor)
    valid_flavors = set(_valid_parsers)
    flavor_set = set(flavor)

    if not flavor_set & valid_flavors:
        raise ValueError(
            f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid "
            f"flavors are {_print_as_set(valid_flavors)}"
        )
    return flavor


def _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links)

        try:
            tables = p.parse_tables()
        except ValueError as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, "seekable") and io.seekable():
                io.seek(0)
            elif hasattr(io, "seekable") and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError(
                    f"The flavor {flav} failed to parse your input. "
                    "Since you passed a non-rewindable file "
                    "object, we can't rewind it to try "
                    "another parser. Try read_html() with a different flavor."
                ) from caught

            retained = caught
        else:
            break
    else:
        assert retained is not None  # for mypy
        raise retained

    ret = []
    for table in tables:
        try:
            df = _data_to_frame(data=table, **kwargs)
            # Cast MultiIndex header to an Index of tuples when extracting header
            # links and replace nan with None (therefore can't use mi.to_flat_index()).
            # This maintains consistency of selection (e.g. df.columns.str[1])
            if extract_links in ("all", "header") and isinstance(
                df.columns, MultiIndex
            ):
                df.columns = Index(
                    ((col[0], None if isna(col[1]) else col[1]) for col in df.columns),
                    tupleize_cols=False,
                )

            ret.append(df)
        except EmptyDataError:  # empty table
            continue
    return ret


@deprecate_nonkeyword_arguments(version="2.0")
def read_html(
    io: FilePath | ReadBuffer[str],
    match: str | Pattern = ".+",
    flavor: str | None = None,
    header: int | Sequence[int] | None = None,
    index_col: int | Sequence[int] | None = None,
    skiprows: int | Sequence[int] | slice | None = None,
    attrs: dict[str, str] | None = None,
    parse_dates: bool = False,
    thousands: str | None = ",",
    encoding: str | None = None,
    decimal: str = ".",
    converters: dict | None = None,
    na_values: Iterable[object] | None = None,
    keep_default_na: bool = True,
    displayed_only: bool = True,
    extract_links: Literal[None, "header", "footer", "body", "all"] = None,
) -> list[DataFrame]:
    r"""
    Read HTML tables into a ``list`` of ``DataFrame`` objects.

    Parameters
    ----------
    io : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a string ``read()`` function.
        The string can represent a URL or the HTML itself. Note that
        lxml only accepts the http, ftp and file url protocols. If you have a
        URL that starts with ``'https'`` you might try removing the ``'s'``.

    match : str or compiled regular expression, optional
        The set of tables containing text matching this regex or string will be
        returned. Unless the HTML is extremely simple you will probably need to
        pass a non-empty string here. Defaults to '.+' (match any non-empty
        string). The default value will return all tables contained on a page.
        This value is converted to a regular expression so that there is
        consistent behavior between Beautiful Soup and lxml.

    flavor : str, optional
        The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
        each other, they are both there for backwards compatibility. The
        default of ``None`` tries to use ``lxml`` to parse and if that fails it
        falls back on ``bs4`` + ``html5lib``.

    header : int or list-like, optional
        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
        make the columns headers.

    index_col : int or list-like, optional
        The column (or list of columns) to use to create the index.

    skiprows : int, list-like or slice, optional
        Number of rows to skip after parsing the column header. 0-based. If a
        sequence of integers or a slice is given, will skip the rows indexed by
        that sequence. Note that a single element sequence means 'skip the nth
        row' whereas an integer means 'skip n rows'.

    attrs : dict, optional
        This is a dictionary of attributes that you can pass to use to identify
        the table in the HTML. These are not checked for validity before being
        passed to lxml or Beautiful Soup. However, these attributes must be
        valid HTML table attributes to work correctly. For example, ::

            attrs = {'id': 'table'}

        is a valid attribute dictionary because the 'id' HTML tag attribute is
        a valid HTML attribute for *any* HTML tag as per `this document
        <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. ::

            attrs = {'asdf': 'table'}

        is *not* a valid attribute dictionary because 'asdf' is not a valid
        HTML attribute even if it is a valid XML attribute. Valid HTML 4.01
        table attributes can be found `here
        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
        working draft of the HTML 5 spec can be found `here
        <https://html.spec.whatwg.org/multipage/tables.html>`__. It contains the
        latest information on table attributes for the modern web.

    parse_dates : bool, optional
        See :func:`~read_csv` for more details.

    thousands : str, optional
        Separator to use to parse thousands. Defaults to ``','``.

    encoding : str, optional
        The encoding used to decode the web page. Defaults to ``None``. ``None``
        preserves the previous encoding behavior, which depends on the
        underlying parser library (e.g., the parser library will try to use
        the encoding provided by the document).

    decimal : str, default '.'
        Character to recognize as decimal point (e.g. use ',' for European
        data).

    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.

    na_values : iterable, default None
        Custom NA values.

    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to.

    displayed_only : bool, default True
        Whether elements with "display: none" style should be skipped when
        parsing; if False, hidden elements are parsed as well.

    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Returns
    -------
    dfs
        A list of DataFrames.

    See Also
    --------
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Notes
    -----
    Before using this function you should read the :ref:`gotchas about the
    HTML parsing libraries <io.html.gotchas>`.

    Expect to do some cleanup after you call this function. For example, you
    might need to manually assign column names if the column names are
    converted to NaN when you pass the `header=0` argument. We try to assume as
    little as possible about the structure of the table and push the
    idiosyncrasies of the HTML contained in the table to the user.

    This function searches for ``<table>`` elements and only for ``<tr>``
    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
    element in the table. ``<td>`` stands for "table data". This function
    attempts to properly handle ``colspan`` and ``rowspan`` attributes.
    If the table has a ``<thead>`` element, it is used to construct
    the header, otherwise the function attempts to find the header within
    the body (by putting rows with only ``<th>`` elements into the header).

    Similar to :func:`~read_csv` the `header` argument is applied
    **after** `skiprows` is applied.

    This function will *always* return a list of :class:`DataFrame` *or*
    it will fail, e.g., it will *not* return an empty list.

    Examples
    --------
    See the :ref:`read_html documentation in the IO section of the docs
    <io.read_html>` for some examples of reading in HTML tables.
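
    A minimal call on an in-memory table (output illustrative; skipped under
    doctest because it requires an optional parser backend):

    >>> html = "<table><tr><th>a</th></tr><tr><td>1</td></tr></table>"
    >>> read_html(html)  # doctest: +SKIP
    [   a
    0  1]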

1185 """ 

1186 _importers() 

1187 

1188 # Type check here. We don't want to parse only to fail because of an 

1189 # invalid value of an integer skiprows. 

1190 if isinstance(skiprows, numbers.Integral) and skiprows < 0: 

1191 raise ValueError( 

1192 "cannot skip rows starting from the end of the " 

1193 "data (you passed a negative value)" 

1194 ) 

1195 if extract_links not in [None, "header", "footer", "body", "all"]: 

1196 raise ValueError( 

1197 "`extract_links` must be one of " 

1198 '{None, "header", "footer", "body", "all"}, got ' 

1199 f'"{extract_links}"' 

1200 ) 

1201 validate_header_arg(header) 

1202 

1203 io = stringify_path(io) 

1204 

1205 return _parse( 

1206 flavor=flavor, 

1207 io=io, 

1208 match=match, 

1209 header=header, 

1210 index_col=index_col, 

1211 skiprows=skiprows, 

1212 parse_dates=parse_dates, 

1213 thousands=thousands, 

1214 attrs=attrs, 

1215 encoding=encoding, 

1216 decimal=decimal, 

1217 converters=converters, 

1218 na_values=na_values, 

1219 keep_default_na=keep_default_na, 

1220 displayed_only=displayed_only, 

1221 extract_links=extract_links, 

1222 )