Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/xml.py: 13%

226 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1""" 

2:mod:`pandas.io.xml` is a module for reading XML. 

3""" 

4 

5from __future__ import annotations 

6 

7import io 

8from typing import ( 

9 Any, 

10 Callable, 

11 Sequence, 

12) 

13 

14from pandas._typing import ( 

15 TYPE_CHECKING, 

16 CompressionOptions, 

17 ConvertersArg, 

18 DtypeArg, 

19 FilePath, 

20 ParseDatesArg, 

21 ReadBuffer, 

22 StorageOptions, 

23 XMLParsers, 

24) 

25from pandas.compat._optional import import_optional_dependency 

26from pandas.errors import ( 

27 AbstractMethodError, 

28 ParserError, 

29) 

30from pandas.util._decorators import ( 

31 deprecate_nonkeyword_arguments, 

32 doc, 

33) 

34 

35from pandas.core.dtypes.common import is_list_like 

36 

37from pandas.core.shared_docs import _shared_docs 

38 

39from pandas.io.common import ( 

40 file_exists, 

41 get_handle, 

42 infer_compression, 

43 is_fsspec_url, 

44 is_url, 

45 stringify_path, 

46) 

47from pandas.io.parsers import TextParser 

48 

49if TYPE_CHECKING: 49 ↛ 50line 49 didn't jump to line 50, because the condition on line 49 was never true

50 from xml.etree.ElementTree import Element 

51 

52 from lxml.etree import ( 

53 _Element, 

54 _XSLTResultTree, 

55 ) 

56 

57 from pandas import DataFrame 

58 

59 

@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
class _XMLFrameParser:
    """
    Internal subclass to parse XML into DataFrames.

    Parameters
    ----------
    path_or_buffer : a valid JSON str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file.

    xpath : str or regex
        The XPath expression to parse required set of nodes for
        migration to `Data Frame`. `etree` supports limited XPath.

    namespaces : dict
        The namespaces defined in XML document (`xmlns:namespace='URI')
        as dicts with key being namespace and value the URI.

    elems_only : bool
        Parse only the child elements at the specified `xpath`.

    attrs_only : bool
        Parse only the attributes at the specified `xpath`.

    names : list
        Column names for Data Frame of parsed XML data.

    dtype : dict
        Data type for data or columns. E.g. {{'a': np.float64,
        'b': np.int32, 'c': 'Int64'}}

        .. versionadded:: 1.5.0

    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels.

        .. versionadded:: 1.5.0

    parse_dates : bool or list of int or names or list of lists or dict
        Converts either index or select columns to datetimes

        .. versionadded:: 1.5.0

    encoding : str
        Encoding of xml object or document.

    stylesheet : str or file-like
        URL, file, file-like object, or a raw string containing XSLT,
        `etree` does not support XSLT but retained for consistency.

    iterparse : dict, optional
        Dict with row element as key and list of descendant elements
        and/or attributes as value to be retrieved in iterparsing of
        XML document.

        .. versionadded:: 1.5.0

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    See also
    --------
    pandas.io.xml._EtreeFrameParser
    pandas.io.xml._LxmlFrameParser

    Notes
    -----
    To subclass this class effectively you must override the following methods:`
    * :func:`parse_data`
    * :func:`_parse_nodes`
    * :func:`_iterparse_nodes`
    * :func:`_parse_doc`
    * :func:`_validate_names`
    * :func:`_validate_path`


    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
        xpath: str,
        namespaces: dict[str, str] | None,
        elems_only: bool,
        attrs_only: bool,
        names: Sequence[str] | None,
        dtype: DtypeArg | None,
        converters: ConvertersArg | None,
        parse_dates: ParseDatesArg | None,
        encoding: str | None,
        stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
        iterparse: dict[str, list[str]] | None,
        compression: CompressionOptions,
        storage_options: StorageOptions,
    ) -> None:
        self.path_or_buffer = path_or_buffer
        self.xpath = xpath
        self.namespaces = namespaces
        self.elems_only = elems_only
        self.attrs_only = attrs_only
        self.names = names
        self.dtype = dtype
        self.converters = converters
        self.parse_dates = parse_dates
        self.encoding = encoding
        self.stylesheet = stylesheet
        self.iterparse = iterparse
        # NOTE(review): is_style is assigned here but never read or reassigned
        # anywhere in this class or its visible subclasses — looks vestigial;
        # confirm against the rest of the package before removing.
        self.is_style = None
        self.compression = compression
        self.storage_options = storage_options

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate xpath, names, parse and return specific nodes.
        """

        raise AbstractMethodError(self)

    def _parse_nodes(self, elems: list[Any]) -> list[dict[str, str | None]]:
        """
        Parse xml nodes.

        This method will parse the children and attributes of elements
        in xpath, conditionally for only elements, only attributes
        or both while optionally renaming node names.

        Raises
        ------
        ValueError
            * If only elements and only attributes are specified.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes compared to siblings
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]]

        if self.elems_only and self.attrs_only:
            raise ValueError("Either element or attributes can be parsed not both.")
        elif self.elems_only:
            if self.names:
                # One dict per row element: the element's own non-blank text
                # (keyed by its tag) merged with child texts keyed by the
                # user-supplied names (zip truncates to the shorter of the two).
                dicts = [
                    {
                        **(
                            {el.tag: el.text.strip()}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            nm: ch.text.strip() if ch.text else None
                            for nm, ch in zip(self.names, el.findall("*"))
                        },
                    }
                    for el in elems
                ]
            else:
                # Child texts keyed by the children's own tags.
                dicts = [
                    {
                        ch.tag: ch.text.strip() if ch.text else None
                        for ch in el.findall("*")
                    }
                    for el in elems
                ]

        elif self.attrs_only:
            # Attributes only; blank attribute values collapse to None.
            dicts = [
                {k: v.strip() if v else None for k, v in el.attrib.items()}
                for el in elems
            ]

        else:
            # Default: attributes first, then the element's own text, then
            # child texts — later entries overwrite earlier duplicate keys.
            if self.names:
                dicts = [
                    {
                        **el.attrib,
                        **(
                            {el.tag: el.text.strip()}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            nm: ch.text.strip() if ch.text else None
                            for nm, ch in zip(self.names, el.findall("*"))
                        },
                    }
                    for el in elems
                ]

            else:
                dicts = [
                    {
                        **el.attrib,
                        **(
                            {el.tag: el.text.strip()}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            ch.tag: ch.text.strip() if ch.text else None
                            for ch in el.findall("*")
                        },
                    }
                    for el in elems
                ]

        # Strip any namespace URI prefix ("{uri}tag" -> "tag") from keys.
        dicts = [
            {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts
        ]

        # Union of keys in first-seen order; pad rows missing a key with None
        # so every dict has an identical, ordered key set (rectangular data).
        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            # Rename columns positionally to the user-supplied names.
            dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]

        return dicts

    def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
        """
        Iterparse xml nodes.

        This method will read in local disk, decompressed XML files for elements
        and underlying descendants using iterparse, a method to iterate through
        an XML tree without holding entire XML tree in memory.

        Raises
        ------
        TypeError
            * If `iterparse` is not a dict or its dict value is not list-like.
        ParserError
            * If `path_or_buffer` is not a physical, decompressed file on disk.
            * If no data is returned from selected items in `iterparse`.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes in submitted list
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]] = []
        row: dict[str, str | None] | None = None

        if not isinstance(self.iterparse, dict):
            raise TypeError(
                f"{type(self.iterparse).__name__} is not a valid type for iterparse"
            )

        # The single expected key is the repeating row element; its value is
        # the list of descendant element/attribute names to collect.
        row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
        if not is_list_like(self.iterparse[row_node]):
            raise TypeError(
                f"{type(self.iterparse[row_node])} is not a valid type "
                "for value in iterparse"
            )

        # iterparse only works on an uncompressed file on local disk — reject
        # buffers, URLs, literal XML strings, and compressed paths up front.
        if (
            not isinstance(self.path_or_buffer, str)
            or is_url(self.path_or_buffer)
            or is_fsspec_url(self.path_or_buffer)
            or self.path_or_buffer.startswith(("<?xml", "<"))
            or infer_compression(self.path_or_buffer, "infer") is not None
        ):
            raise ParserError(
                "iterparse is designed for large XML files that are fully extracted on "
                "local disk and not as compressed files or online sources."
            )

        for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
            # Tag with any "{uri}" namespace prefix stripped.
            curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag

            if event == "start":
                if curr_elem == row_node:
                    # Entering a new row element: start accumulating.
                    row = {}

            if row is not None:
                if self.names:
                    for col, nm in zip(self.iterparse[row_node], self.names):
                        if curr_elem == col:
                            elem_val = elem.text.strip() if elem.text else None
                            # First value wins for each renamed key.
                            if row.get(nm) != elem_val and nm not in row:
                                row[nm] = elem_val
                        if col in elem.attrib:
                            if elem.attrib[col] not in row.values() and nm not in row:
                                row[nm] = elem.attrib[col]
                else:
                    for col in self.iterparse[row_node]:
                        if curr_elem == col:
                            row[col] = elem.text.strip() if elem.text else None
                        if col in elem.attrib:
                            # Attribute value overrides any text captured above.
                            row[col] = elem.attrib[col]

            if event == "end":
                if curr_elem == row_node and row is not None:
                    dicts.append(row)
                    row = None

                # Free the processed element; for lxml (which has
                # getprevious/getparent, unlike etree) also delete completed
                # preceding siblings to keep memory bounded on huge files.
                elem.clear()
                if hasattr(elem, "getprevious"):
                    while (
                        elem.getprevious() is not None and elem.getparent() is not None
                    ):
                        del elem.getparent()[0]

        if dicts == []:
            raise ParserError("No result from selected items in iterparse.")

        # Same rectangularization as _parse_nodes: ordered key union, pad
        # missing keys with None, then optionally rename positionally.
        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]

        return dicts

    def _validate_path(self) -> None:
        """
        Validate xpath.

        This method checks for syntax, evaluation, or empty nodes return.

        Raises
        ------
        SyntaxError
            * If xpah is not supported or issues with namespaces.

        ValueError
            * If xpah does not return any nodes.
        """

        raise AbstractMethodError(self)

    def _validate_names(self) -> None:
        """
        Validate names.

        This method will check if names is a list-like and aligns
        with length of parse nodes.

        Raises
        ------
        ValueError
            * If value is not a list and less then length of nodes.
        """
        raise AbstractMethodError(self)

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element | _Element:
        """
        Build tree from path_or_buffer.

        This method will parse XML object into tree
        either from string/bytes or file location.
        """
        raise AbstractMethodError(self)

431 

432 

class _EtreeFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with the Python
    standard library XML module: `xml.etree.ElementTree`.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        Validates xpath and names, then either parses the whole document
        tree or, when ``iterparse`` is given, streams the file with
        ``iterparse`` without building the full tree in memory.
        """
        from xml.etree.ElementTree import iterparse

        # XSLT requires lxml; fail fast rather than silently ignoring it.
        if self.stylesheet is not None:
            raise ValueError(
                "To use stylesheet, you need lxml installed and selected as parser."
            )

        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)
            self._validate_path()
            elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)

        self._validate_names()

        xml_dicts: list[dict[str, str | None]] = (
            self._parse_nodes(elems)
            if self.iterparse is None
            else self._iterparse_nodes(iterparse)
        )

        return xml_dicts

    def _validate_path(self) -> None:
        """
        Validate xpath.

        Raises
        ------
        SyntaxError
            * If xpath is not supported by etree or uses an undeclared
              namespace prefix.
        ValueError
            * If xpath does not return any usable nodes.

        Notes
        -----
        `etree` supports limited XPath. If user attempts a more complex
        expression syntax error will raise.
        """

        msg = (
            "xpath does not return any nodes. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )
        try:
            elems = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
            if elems is None:
                raise ValueError(msg)

            # BUG FIX: ElementTree's ``attrib`` is always a dict, never None,
            # so the previous ``elems.attrib is None`` test could never fire
            # and a matched node with neither children nor attributes slipped
            # through validation. Check emptiness instead, mirroring the lxml
            # parser's children/attrs validation.
            if elems.find("*") is None and not elems.attrib:
                raise ValueError(msg)

        except (KeyError, SyntaxError):
            raise SyntaxError(
                "You have used an incorrect or unsupported XPath "
                "expression for etree library or you used an "
                "undeclared namespace prefix."
            )

    def _validate_names(self) -> None:
        """
        Validate names.

        Raises
        ------
        ValueError
            * If names is shorter than the number of child elements at xpath.
        TypeError
            * If names is not list-like.
        """
        children: list[Any]

        if self.names:
            if self.iterparse:
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
                # BUG FIX: truth-testing an Element is deprecated and is False
                # for a childless element, which wrongly conflated "found but
                # childless" with "not found". Compare against None explicitly.
                children = parent.findall("*") if parent is not None else []

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element:
        """
        Build an ElementTree root from ``raw_doc``.

        Extracts raw content (path, URL, buffer, or literal XML), wraps it
        in an in-memory buffer, and parses it with the configured encoding.
        """
        from xml.etree.ElementTree import (
            XMLParser,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)
            doc = parse(xml_data, parser=curr_parser)

        return doc.getroot()

531 

532 

class _LxmlFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with third-party
    full-featured XML library, `lxml`, that supports
    XPath 1.0 and XSLT 1.0.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate xpath, names, optionally parse and run XSLT,
        and parse original or transformed XML and return specific nodes.
        """
        from lxml.etree import iterparse

        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)

            if self.stylesheet:
                # Apply the XSLT first: xpath below targets the transformed
                # tree, not the original document.
                self.xsl_doc = self._parse_doc(self.stylesheet)
                self.xml_doc = self._transform_doc()

            self._validate_path()
            elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)

        self._validate_names()

        xml_dicts: list[dict[str, str | None]] = (
            self._parse_nodes(elems)
            if self.iterparse is None
            else self._iterparse_nodes(iterparse)
        )

        return xml_dicts

    def _validate_path(self) -> None:
        # Reject an xpath that matches nothing, or that matches only nodes
        # with neither children nor attributes (no columns to build).

        msg = (
            "xpath does not return any nodes. "
            "Be sure row level nodes are in xpath. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )

        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
        children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces)
        attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces)

        if elems == []:
            raise ValueError(msg)

        if elems != [] and attrs == [] and children == []:
            raise ValueError(msg)

    def _validate_names(self) -> None:
        # Names must be list-like and at least as long as the child elements
        # of the first matched node (or the iterparse descendant list).
        children: list[Any]

        if self.names:
            if self.iterparse:
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                # "[1]" limits the query to the first matched node's children.
                children = self.xml_doc.xpath(
                    self.xpath + "[1]/*", namespaces=self.namespaces
                )

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> _Element:
        """
        Build tree from path_or_buffer.

        This method will parse XML object into tree
        either from string/bytes or file location.
        """
        from lxml.etree import (
            XMLParser,
            fromstring,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)

            if isinstance(xml_data, io.StringIO):
                # lxml parses bytes; a StringIO must be re-encoded, which
                # requires an explicit encoding from the caller.
                if self.encoding is None:
                    raise TypeError(
                        "Can not pass encoding None when input is StringIO."
                    )

                doc = fromstring(
                    xml_data.getvalue().encode(self.encoding), parser=curr_parser
                )
            else:
                doc = parse(xml_data, parser=curr_parser)

        return doc

    def _transform_doc(self) -> _XSLTResultTree:
        """
        Transform original tree using stylesheet.

        This method will transform original xml using XSLT script into
        am ideally flatter xml document for easier parsing and migration
        to Data Frame.
        """
        from lxml.etree import XSLT

        transformer = XSLT(self.xsl_doc)
        new_doc = transformer(self.xml_doc)

        return new_doc

658 

659 

def get_data_from_filepath(
    filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
    encoding: str | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
    """
    Extract raw XML data.

    Accepts a filesystem path or URL (string-like), a file-like object,
    or a raw XML string/bytes. Path-like strings are opened and read so
    the caller always receives in-memory content or a file-like object;
    literal XML, bytes, and file-like objects pass through unchanged.
    """
    if not isinstance(filepath_or_buffer, bytes):
        filepath_or_buffer = stringify_path(filepath_or_buffer)

    # Only strings that are NOT literal XML documents (which start with
    # "<?xml" or "<") are candidates for being a path or URL.
    looks_like_path = isinstance(
        filepath_or_buffer, str
    ) and not filepath_or_buffer.startswith(("<?xml", "<"))

    if looks_like_path and (
        is_url(filepath_or_buffer)
        or is_fsspec_url(filepath_or_buffer)
        or file_exists(filepath_or_buffer)
    ):
        with get_handle(
            filepath_or_buffer,
            "r",
            encoding=encoding,
            compression=compression,
            storage_options=storage_options,
        ) as handle_obj:
            handle = handle_obj.handle
            # Prefer reading the content eagerly; fall back to the raw
            # handle if it is not readable.
            filepath_or_buffer = (
                handle.read() if hasattr(handle, "read") else handle
            )

    return filepath_or_buffer

706 

707 

def preprocess_data(data) -> io.StringIO | io.BytesIO:
    """
    Wrap raw XML content in an in-memory buffer.

    A ``str`` becomes a ``StringIO`` and ``bytes`` become a ``BytesIO``;
    anything else (e.g. an open file object or an existing buffer with a
    ``read`` attribute) is returned untouched.
    """

    if isinstance(data, str):
        return io.StringIO(data)

    if isinstance(data, bytes):
        return io.BytesIO(data)

    return data

724 

725 

def _data_to_frame(data, **kwargs) -> DataFrame:
    """
    Convert parsed XML row dicts into a DataFrame.

    Column labels come from the keys of the first parsed dict; each
    dict's values become one row. TextParser performs the DataFrame
    construction and infers specific dtypes.
    """

    first_row = next(iter(data))
    row_values = [list(row.values()) for row in data]

    try:
        with TextParser(row_values, names=first_row, **kwargs) as parser:
            return parser.read()
    except ParserError:
        raise ParserError(
            "XML document may be too complex for import. "
            "Try to flatten document and use distinct "
            "element and attribute names."
        )

747 

748 

def _parse(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    xpath: str,
    namespaces: dict[str, str] | None,
    elems_only: bool,
    attrs_only: bool,
    names: Sequence[str] | None,
    dtype: DtypeArg | None,
    converters: ConvertersArg | None,
    parse_dates: ParseDatesArg | None,
    encoding: str | None,
    parser: XMLParsers,
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
    iterparse: dict[str, list[str]] | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
    **kwargs,
) -> DataFrame:
    """
    Dispatch to the requested internal XML parser.

    Instantiates ``_LxmlFrameParser`` or ``_EtreeFrameParser`` depending
    on ``parser``, runs it, and binds the parsed rows into a DataFrame.

    Raises
    ------
    ImportError
        * If lxml is selected as parser but is not installed.

    ValueError
        * If parser is not lxml or etree.
    """

    p: _EtreeFrameParser | _LxmlFrameParser

    # Both parser classes share the same constructor signature.
    parser_args = (
        path_or_buffer,
        xpath,
        namespaces,
        elems_only,
        attrs_only,
        names,
        dtype,
        converters,
        parse_dates,
        encoding,
        stylesheet,
        iterparse,
        compression,
        storage_options,
    )

    if parser == "lxml":
        # errors="ignore" returns None instead of raising so a tailored
        # message can be raised here.
        lxml = import_optional_dependency("lxml.etree", errors="ignore")
        if lxml is None:
            raise ImportError("lxml not found, please install or use the etree parser.")
        p = _LxmlFrameParser(*parser_args)
    elif parser == "etree":
        p = _EtreeFrameParser(*parser_args)
    else:
        raise ValueError("Values for parser can only be lxml or etree.")

    data_dicts = p.parse_data()

    return _data_to_frame(
        data=data_dicts,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        **kwargs,
    )

836 

837 

@deprecate_nonkeyword_arguments(version=None, allowed_args=["path_or_buffer"])
@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
def read_xml(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    xpath: str = "./*",
    namespaces: dict[str, str] | None = None,
    elems_only: bool = False,
    attrs_only: bool = False,
    names: Sequence[str] | None = None,
    dtype: DtypeArg | None = None,
    converters: ConvertersArg | None = None,
    parse_dates: ParseDatesArg | None = None,
    # encoding can not be None for lxml and StringIO input
    encoding: str | None = "utf-8",
    parser: XMLParsers = "lxml",
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
    iterparse: dict[str, list[str]] | None = None,
    compression: CompressionOptions = "infer",
    storage_options: StorageOptions = None,
) -> DataFrame:
    r"""
    Read XML document into a ``DataFrame`` object.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    path_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a ``read()`` function. The string can be any valid XML
        string or a path. The string can further be a URL. Valid URL schemes
        include http, ftp, s3, and file.

    xpath : str, optional, default './\*'
        The XPath to parse required set of nodes for migration to DataFrame.
        XPath should return a collection of elements and not a single
        element. Note: The ``etree`` parser supports limited XPath
        expressions. For more complex XPath, use ``lxml`` which requires
        installation.

    namespaces : dict, optional
        The namespaces defined in XML document as dicts with key being
        namespace prefix and value the URI. There is no need to include all
        namespaces in XML, only the ones used in ``xpath`` expression.
        Note: if XML document uses default namespace denoted as
        `xmlns='<URI>'` without a prefix, you must assign any temporary
        namespace prefix such as 'doc' to the URI in order to parse
        underlying nodes and/or attributes. For example, ::

            namespaces = {{"doc": "https://example.com"}}

    elems_only : bool, optional, default False
        Parse only the child elements at the specified ``xpath``. By default,
        all child elements and non-empty text nodes are returned.

    attrs_only :  bool, optional, default False
        Parse only the attributes at the specified ``xpath``.
        By default, all attributes are returned.

    names :  list-like, optional
        Column names for DataFrame of parsed XML data. Use this parameter to
        rename original element names and distinguish same named elements and
        attributes.

    dtype : Type name or dict of column -> type, optional
        Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
        'c': 'Int64'}}
        Use `str` or `object` together with suitable `na_values` settings
        to preserve and not interpret dtype.
        If converters are specified, they will be applied INSTEAD
        of dtype conversion.

        .. versionadded:: 1.5.0

    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can either
        be integers or column labels.

        .. versionadded:: 1.5.0

    parse_dates : bool or list of int or names or list of lists or dict, default False
        Identifiers to parse index or columns to datetime. The behavior is as follows:

        * boolean. If True -> try parsing the index.
        * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
          each as a separate date column.
        * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
          a single date column.
        * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
          result 'foo'

        .. versionadded:: 1.5.0

    encoding : str, optional, default 'utf-8'
        Encoding of XML document.

    parser : {{'lxml','etree'}}, default 'lxml'
        Parser module to use for retrieval of data. Only 'lxml' and
        'etree' are supported. With 'lxml' more complex XPath searches
        and ability to use XSLT stylesheet are supported.

    stylesheet : str, path object or file-like object
        A URL, file-like object, or a raw string containing an XSLT script.
        This stylesheet should flatten complex, deeply nested XML documents
        for easier parsing. To use this feature you must have ``lxml`` module
        installed and specify 'lxml' as ``parser``. The ``xpath`` must
        reference nodes of transformed XML document generated after XSLT
        transformation and not the original XML document. Only XSLT 1.0
        scripts and not later versions is currently supported.

    iterparse : dict, optional
        The nodes or attributes to retrieve in iterparsing of XML document
        as a dict with key being the name of repeating element and value being
        list of elements or attribute names that are descendants of the repeated
        element. Note: If this option is used, it will replace ``xpath`` parsing
        and unlike xpath, descendants do not need to relate to each other but can
        exist any where in document under the repeating element. This memory-
        efficient method should be used for very large XML files (500MB, 1GB, or 5GB+).
        For example, ::

            iterparse = {{"row_element": ["child_elem", "attr", "grandchild_elem"]}}

        .. versionadded:: 1.5.0

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    Returns
    -------
    df
        A DataFrame.

    See Also
    --------
    read_json : Convert a JSON string to pandas object.
    read_html : Read HTML tables into a list of DataFrame objects.

    Notes
    -----
    This method is best designed to import shallow XML documents in
    following format which is the ideal fit for the two-dimensions of a
    ``DataFrame`` (row by column). ::

            <root>
                <row>
                  <column1>data</column1>
                  <column2>data</column2>
                  <column3>data</column3>
                  ...
               </row>
               <row>
                  ...
               </row>
               ...
            </root>

    As a file format, XML documents can be designed any way including
    layout of elements and attributes as long as it conforms to W3C
    specifications. Therefore, this method is a convenience handler for
    a specific flatter design and not all possible XML structures.

    However, for more complex XML documents, ``stylesheet`` allows you to
    temporarily redesign original document with XSLT (a special purpose
    language) for a flatter version for migration to a DataFrame.

    This function will *always* return a single :class:`DataFrame` or raise
    exceptions due to issues with XML document, ``xpath``, or other
    parameters.

    See the :ref:`read_xml documentation in the IO section of the docs
    <io.read_xml>` for more information in using this method to parse XML
    files to DataFrames.

    Examples
    --------
    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data xmlns="http://example.com">
    ...  <row>
    ...    <shape>square</shape>
    ...    <degrees>360</degrees>
    ...    <sides>4.0</sides>
    ...  </row>
    ...  <row>
    ...    <shape>circle</shape>
    ...    <degrees>360</degrees>
    ...    <sides/>
    ...  </row>
    ...  <row>
    ...    <shape>triangle</shape>
    ...    <degrees>180</degrees>
    ...    <sides>3.0</sides>
    ...  </row>
    ... </data>'''

    >>> df = pd.read_xml(xml)
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data>
    ...   <row shape="square" degrees="360" sides="4.0"/>
    ...   <row shape="circle" degrees="360"/>
    ...   <row shape="triangle" degrees="180" sides="3.0"/>
    ... </data>'''

    >>> df = pd.read_xml(xml, xpath=".//row")
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <doc:data xmlns:doc="https://example.com">
    ...   <doc:row>
    ...     <doc:shape>square</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides>4.0</doc:sides>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>circle</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides/>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>triangle</doc:shape>
    ...     <doc:degrees>180</doc:degrees>
    ...     <doc:sides>3.0</doc:sides>
    ...   </doc:row>
    ... </doc:data>'''

    >>> df = pd.read_xml(xml,
    ...                  xpath="//doc:row",
    ...                  namespaces={{"doc": "https://example.com"}})
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0
    """

    # Thin public wrapper: argument validation, parser dispatch (lxml vs
    # etree), and DataFrame assembly all happen downstream in _parse.
    return _parse(
        path_or_buffer=path_or_buffer,
        xpath=xpath,
        namespaces=namespaces,
        elems_only=elems_only,
        attrs_only=attrs_only,
        names=names,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        encoding=encoding,
        parser=parser,
        stylesheet=stylesheet,
        iterparse=iterparse,
        compression=compression,
        storage_options=storage_options,
    )