Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/xml.py: 13%

226 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1""" 

2:mod:`pandas.io.xml` is a module for reading XML. 

3""" 

4 

5from __future__ import annotations 

6 

7import io 

8from typing import ( 

9 Any, 

10 Callable, 

11 Sequence, 

12) 

13 

14from pandas._typing import ( 

15 TYPE_CHECKING, 

16 CompressionOptions, 

17 ConvertersArg, 

18 DtypeArg, 

19 FilePath, 

20 ParseDatesArg, 

21 ReadBuffer, 

22 StorageOptions, 

23 XMLParsers, 

24) 

25from pandas.compat._optional import import_optional_dependency 

26from pandas.errors import ( 

27 AbstractMethodError, 

28 ParserError, 

29) 

30from pandas.util._decorators import ( 

31 deprecate_nonkeyword_arguments, 

32 doc, 

33) 

34 

35from pandas.core.dtypes.common import is_list_like 

36 

37from pandas.core.shared_docs import _shared_docs 

38 

39from pandas.io.common import ( 

40 file_exists, 

41 get_handle, 

42 infer_compression, 

43 is_fsspec_url, 

44 is_url, 

45 stringify_path, 

46) 

47from pandas.io.parsers import TextParser 

48 

49if TYPE_CHECKING: 49 ↛ 50line 49 didn't jump to line 50, because the condition on line 49 was never true

50 from xml.etree.ElementTree import Element 

51 

52 from lxml.etree import ( 

53 _Element, 

54 _XSLTResultTree, 

55 ) 

56 

57 from pandas import DataFrame 

58 

59 

@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
class _XMLFrameParser:
    """
    Internal subclass to parse XML into DataFrames.

    Parameters
    ----------
    path_or_buffer : a valid JSON str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file.

    xpath : str or regex
        The XPath expression to parse required set of nodes for
        migration to `Data Frame`. `etree` supports limited XPath.

    namespaces : dict
        The namespaces defined in XML document (`xmlns:namespace='URI')
        as dicts with key being namespace and value the URI.

    elems_only : bool
        Parse only the child elements at the specified `xpath`.

    attrs_only : bool
        Parse only the attributes at the specified `xpath`.

    names : list
        Column names for Data Frame of parsed XML data.

    dtype : dict
        Data type for data or columns. E.g. {{'a': np.float64,
        'b': np.int32, 'c': 'Int64'}}

        .. versionadded:: 1.5.0

    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels.

        .. versionadded:: 1.5.0

    parse_dates : bool or list of int or names or list of lists or dict
        Converts either index or select columns to datetimes

        .. versionadded:: 1.5.0

    encoding : str
        Encoding of xml object or document.

    stylesheet : str or file-like
        URL, file, file-like object, or a raw string containing XSLT,
        `etree` does not support XSLT but retained for consistency.

    iterparse : dict, optional
        Dict with row element as key and list of descendant elements
        and/or attributes as value to be retrieved in iterparsing of
        XML document.

        .. versionadded:: 1.5.0

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    See also
    --------
    pandas.io.xml._EtreeFrameParser
    pandas.io.xml._LxmlFrameParser

    Notes
    -----
    To subclass this class effectively you must override the following methods:`
    * :func:`parse_data`
    * :func:`_parse_nodes`
    * :func:`_iterparse_nodes`
    * :func:`_parse_doc`
    * :func:`_validate_names`
    * :func:`_validate_path`


    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
        xpath: str,
        namespaces: dict[str, str] | None,
        elems_only: bool,
        attrs_only: bool,
        names: Sequence[str] | None,
        dtype: DtypeArg | None,
        converters: ConvertersArg | None,
        parse_dates: ParseDatesArg | None,
        encoding: str | None,
        stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
        iterparse: dict[str, list[str]] | None,
        compression: CompressionOptions,
        storage_options: StorageOptions,
    ) -> None:
        self.path_or_buffer = path_or_buffer
        self.xpath = xpath
        self.namespaces = namespaces
        self.elems_only = elems_only
        self.attrs_only = attrs_only
        self.names = names
        self.dtype = dtype
        self.converters = converters
        self.parse_dates = parse_dates
        self.encoding = encoding
        self.stylesheet = stylesheet
        self.iterparse = iterparse
        # NOTE(review): is_style is assigned here but never read or reassigned
        # anywhere in this class or its visible subclasses — looks vestigial;
        # confirm against the rest of the package before removing.
        self.is_style = None
        self.compression = compression
        self.storage_options = storage_options

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate xpath, names, parse and return specific nodes.
        """

        raise AbstractMethodError(self)

    def _parse_nodes(self, elems: list[Any]) -> list[dict[str, str | None]]:
        """
        Parse xml nodes.

        This method will parse the children and attributes of elements
        in xpath, conditionally for only elements, only attributes
        or both while optionally renaming node names.

        Raises
        ------
        ValueError
            * If only elements and only attributes are specified.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes compared to siblings
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]]

        if self.elems_only and self.attrs_only:
            raise ValueError("Either element or attributes can be parsed not both.")
        elif self.elems_only:
            if self.names:
                # One dict per row element: the element's own non-blank text
                # (keyed by its tag) merged with child texts keyed by the
                # user-supplied names (zip truncates to the shorter of the two).
                dicts = [
                    {
                        **(
                            {el.tag: el.text.strip()}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            nm: ch.text.strip() if ch.text else None
                            for nm, ch in zip(self.names, el.findall("*"))
                        },
                    }
                    for el in elems
                ]
            else:
                # Child texts keyed by the children's own tags.
                dicts = [
                    {
                        ch.tag: ch.text.strip() if ch.text else None
                        for ch in el.findall("*")
                    }
                    for el in elems
                ]

        elif self.attrs_only:
            # Attributes only; blank attribute values collapse to None.
            dicts = [
                {k: v.strip() if v else None for k, v in el.attrib.items()}
                for el in elems
            ]

        else:
            # Default: attributes first, then the element's own text, then
            # child texts — later entries overwrite earlier duplicate keys.
            if self.names:
                dicts = [
                    {
                        **el.attrib,
                        **(
                            {el.tag: el.text.strip()}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            nm: ch.text.strip() if ch.text else None
                            for nm, ch in zip(self.names, el.findall("*"))
                        },
                    }
                    for el in elems
                ]

            else:
                dicts = [
                    {
                        **el.attrib,
                        **(
                            {el.tag: el.text.strip()}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            ch.tag: ch.text.strip() if ch.text else None
                            for ch in el.findall("*")
                        },
                    }
                    for el in elems
                ]

        # Strip any namespace URI prefix ("{uri}tag" -> "tag") from keys.
        dicts = [
            {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts
        ]

        # Union of keys in first-seen order; pad rows missing a key with None
        # so every dict has an identical, ordered key set (rectangular data).
        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            # Rename columns positionally to the user-supplied names.
            dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]

        return dicts

    def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
        """
        Iterparse xml nodes.

        This method will read in local disk, decompressed XML files for elements
        and underlying descendants using iterparse, a method to iterate through
        an XML tree without holding entire XML tree in memory.

        Raises
        ------
        TypeError
            * If `iterparse` is not a dict or its dict value is not list-like.
        ParserError
            * If `path_or_buffer` is not a physical, decompressed file on disk.
            * If no data is returned from selected items in `iterparse`.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes in submitted list
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]] = []
        row: dict[str, str | None] | None = None

        if not isinstance(self.iterparse, dict):
            raise TypeError(
                f"{type(self.iterparse).__name__} is not a valid type for iterparse"
            )

        # The single expected key is the repeating row element; its value is
        # the list of descendant element/attribute names to collect.
        row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
        if not is_list_like(self.iterparse[row_node]):
            raise TypeError(
                f"{type(self.iterparse[row_node])} is not a valid type "
                "for value in iterparse"
            )

        # iterparse only works on an uncompressed file on local disk — reject
        # buffers, URLs, literal XML strings, and compressed paths up front.
        if (
            not isinstance(self.path_or_buffer, str)
            or is_url(self.path_or_buffer)
            or is_fsspec_url(self.path_or_buffer)
            or self.path_or_buffer.startswith(("<?xml", "<"))
            or infer_compression(self.path_or_buffer, "infer") is not None
        ):
            raise ParserError(
                "iterparse is designed for large XML files that are fully extracted on "
                "local disk and not as compressed files or online sources."
            )

        for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
            # Tag with any "{uri}" namespace prefix stripped.
            curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag

            if event == "start":
                if curr_elem == row_node:
                    # Entering a new row element: start accumulating.
                    row = {}

            if row is not None:
                if self.names:
                    for col, nm in zip(self.iterparse[row_node], self.names):
                        if curr_elem == col:
                            elem_val = elem.text.strip() if elem.text else None
                            # First value wins for each renamed key.
                            if row.get(nm) != elem_val and nm not in row:
                                row[nm] = elem_val
                        if col in elem.attrib:
                            if elem.attrib[col] not in row.values() and nm not in row:
                                row[nm] = elem.attrib[col]
                else:
                    for col in self.iterparse[row_node]:
                        if curr_elem == col:
                            row[col] = elem.text.strip() if elem.text else None
                        if col in elem.attrib:
                            # Attribute value overrides any text captured above.
                            row[col] = elem.attrib[col]

            if event == "end":
                if curr_elem == row_node and row is not None:
                    dicts.append(row)
                    row = None

                # Free the processed element; for lxml (which has
                # getprevious/getparent, unlike etree) also delete completed
                # preceding siblings to keep memory bounded on huge files.
                elem.clear()
                if hasattr(elem, "getprevious"):
                    while (
                        elem.getprevious() is not None and elem.getparent() is not None
                    ):
                        del elem.getparent()[0]

        if dicts == []:
            raise ParserError("No result from selected items in iterparse.")

        # Same rectangularization as _parse_nodes: ordered key union, pad
        # missing keys with None, then optionally rename positionally.
        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]

        return dicts

    def _validate_path(self) -> None:
        """
        Validate xpath.

        This method checks for syntax, evaluation, or empty nodes return.

        Raises
        ------
        SyntaxError
            * If xpah is not supported or issues with namespaces.

        ValueError
            * If xpah does not return any nodes.
        """

        raise AbstractMethodError(self)

    def _validate_names(self) -> None:
        """
        Validate names.

        This method will check if names is a list-like and aligns
        with length of parse nodes.

        Raises
        ------
        ValueError
            * If value is not a list and less then length of nodes.
        """
        raise AbstractMethodError(self)

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element | _Element:
        """
        Build tree from path_or_buffer.

        This method will parse XML object into tree
        either from string/bytes or file location.
        """
        raise AbstractMethodError(self)

431 

432 

class _EtreeFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with the Python
    standard library XML module: `xml.etree.ElementTree`.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        Validates xpath and names, then either parses the whole document
        tree or, when ``iterparse`` is given, streams the file with
        ``iterparse`` without building the full tree in memory.
        """
        from xml.etree.ElementTree import iterparse

        # XSLT requires lxml; fail fast rather than silently ignoring it.
        if self.stylesheet is not None:
            raise ValueError(
                "To use stylesheet, you need lxml installed and selected as parser."
            )

        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)
            self._validate_path()
            elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)

        self._validate_names()

        xml_dicts: list[dict[str, str | None]] = (
            self._parse_nodes(elems)
            if self.iterparse is None
            else self._iterparse_nodes(iterparse)
        )

        return xml_dicts

    def _validate_path(self) -> None:
        """
        Validate xpath.

        Raises
        ------
        SyntaxError
            * If xpath is not supported by etree or uses an undeclared
              namespace prefix.
        ValueError
            * If xpath does not return any usable nodes.

        Notes
        -----
        `etree` supports limited XPath. If user attempts a more complex
        expression syntax error will raise.
        """

        msg = (
            "xpath does not return any nodes. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )
        try:
            elems = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
            if elems is None:
                raise ValueError(msg)

            # BUG FIX: ElementTree's ``attrib`` is always a dict, never None,
            # so the previous ``elems.attrib is None`` test could never fire
            # and a matched node with neither children nor attributes slipped
            # through validation. Check emptiness instead, mirroring the lxml
            # parser's children/attrs validation.
            if elems.find("*") is None and not elems.attrib:
                raise ValueError(msg)

        except (KeyError, SyntaxError):
            raise SyntaxError(
                "You have used an incorrect or unsupported XPath "
                "expression for etree library or you used an "
                "undeclared namespace prefix."
            )

    def _validate_names(self) -> None:
        """
        Validate names.

        Raises
        ------
        ValueError
            * If names is shorter than the number of child elements at xpath.
        TypeError
            * If names is not list-like.
        """
        children: list[Any]

        if self.names:
            if self.iterparse:
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
                # BUG FIX: truth-testing an Element is deprecated and is False
                # for a childless element, which wrongly conflated "found but
                # childless" with "not found". Compare against None explicitly.
                children = parent.findall("*") if parent is not None else []

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element:
        """
        Build an ElementTree root from ``raw_doc``.

        Extracts raw content (path, URL, buffer, or literal XML), wraps it
        in an in-memory buffer, and parses it with the configured encoding.
        """
        from xml.etree.ElementTree import (
            XMLParser,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)
            doc = parse(xml_data, parser=curr_parser)

        return doc.getroot()

531 

532 

class _LxmlFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with third-party
    full-featured XML library, `lxml`, that supports
    XPath 1.0 and XSLT 1.0.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate xpath, names, optionally parse and run XSLT,
        and parse original or transformed XML and return specific nodes.
        """
        from lxml.etree import iterparse

        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)

            if self.stylesheet:
                # Apply the XSLT first: xpath below targets the transformed
                # tree, not the original document.
                self.xsl_doc = self._parse_doc(self.stylesheet)
                self.xml_doc = self._transform_doc()

            self._validate_path()
            elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)

        self._validate_names()

        xml_dicts: list[dict[str, str | None]] = (
            self._parse_nodes(elems)
            if self.iterparse is None
            else self._iterparse_nodes(iterparse)
        )

        return xml_dicts

    def _validate_path(self) -> None:
        # Reject an xpath that matches nothing, or that matches only nodes
        # with neither children nor attributes (no columns to build).

        msg = (
            "xpath does not return any nodes. "
            "Be sure row level nodes are in xpath. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )

        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
        children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces)
        attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces)

        if elems == []:
            raise ValueError(msg)

        if elems != [] and attrs == [] and children == []:
            raise ValueError(msg)

    def _validate_names(self) -> None:
        # Names must be list-like and at least as long as the child elements
        # of the first matched node (or the iterparse descendant list).
        children: list[Any]

        if self.names:
            if self.iterparse:
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                # "[1]" limits the query to the first matched node's children.
                children = self.xml_doc.xpath(
                    self.xpath + "[1]/*", namespaces=self.namespaces
                )

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> _Element:
        """
        Build tree from path_or_buffer.

        This method will parse XML object into tree
        either from string/bytes or file location.
        """
        from lxml.etree import (
            XMLParser,
            fromstring,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)

            if isinstance(xml_data, io.StringIO):
                # lxml parses bytes; a StringIO must be re-encoded, which
                # requires an explicit encoding from the caller.
                if self.encoding is None:
                    raise TypeError(
                        "Can not pass encoding None when input is StringIO."
                    )

                doc = fromstring(
                    xml_data.getvalue().encode(self.encoding), parser=curr_parser
                )
            else:
                doc = parse(xml_data, parser=curr_parser)

        return doc

    def _transform_doc(self) -> _XSLTResultTree:
        """
        Transform original tree using stylesheet.

        This method will transform original xml using XSLT script into
        am ideally flatter xml document for easier parsing and migration
        to Data Frame.
        """
        from lxml.etree import XSLT

        transformer = XSLT(self.xsl_doc)
        new_doc = transformer(self.xml_doc)

        return new_doc

658 

659 

def get_data_from_filepath(
    filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
    encoding: str | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
    """
    Extract raw XML data.

    Accepts a filesystem path or URL (string-like), a file-like object,
    or a raw XML string/bytes. Path-like strings are opened and read so
    the caller always receives in-memory content or a file-like object;
    literal XML, bytes, and file-like objects pass through unchanged.
    """
    if not isinstance(filepath_or_buffer, bytes):
        filepath_or_buffer = stringify_path(filepath_or_buffer)

    # Only strings that are NOT literal XML documents (which start with
    # "<?xml" or "<") are candidates for being a path or URL.
    looks_like_path = isinstance(
        filepath_or_buffer, str
    ) and not filepath_or_buffer.startswith(("<?xml", "<"))

    if looks_like_path and (
        is_url(filepath_or_buffer)
        or is_fsspec_url(filepath_or_buffer)
        or file_exists(filepath_or_buffer)
    ):
        with get_handle(
            filepath_or_buffer,
            "r",
            encoding=encoding,
            compression=compression,
            storage_options=storage_options,
        ) as handle_obj:
            handle = handle_obj.handle
            # Prefer reading the content eagerly; fall back to the raw
            # handle if it is not readable.
            filepath_or_buffer = (
                handle.read() if hasattr(handle, "read") else handle
            )

    return filepath_or_buffer

706 

707 

def preprocess_data(data) -> io.StringIO | io.BytesIO:
    """
    Wrap raw XML content in an in-memory buffer.

    A ``str`` becomes a ``StringIO`` and ``bytes`` become a ``BytesIO``;
    anything else (e.g. an open file object or an existing buffer with a
    ``read`` attribute) is returned untouched.
    """

    if isinstance(data, str):
        return io.StringIO(data)

    if isinstance(data, bytes):
        return io.BytesIO(data)

    return data

724 

725 

def _data_to_frame(data, **kwargs) -> DataFrame:
    """
    Convert parsed XML row dicts into a DataFrame.

    Column labels come from the keys of the first parsed dict; each
    dict's values become one row. TextParser performs the DataFrame
    construction and infers specific dtypes.
    """

    first_row = next(iter(data))
    row_values = [list(row.values()) for row in data]

    try:
        with TextParser(row_values, names=first_row, **kwargs) as parser:
            return parser.read()
    except ParserError:
        raise ParserError(
            "XML document may be too complex for import. "
            "Try to flatten document and use distinct "
            "element and attribute names."
        )

747 

748 

def _parse(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    xpath: str,
    namespaces: dict[str, str] | None,
    elems_only: bool,
    attrs_only: bool,
    names: Sequence[str] | None,
    dtype: DtypeArg | None,
    converters: ConvertersArg | None,
    parse_dates: ParseDatesArg | None,
    encoding: str | None,
    parser: XMLParsers,
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
    iterparse: dict[str, list[str]] | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
    **kwargs,
) -> DataFrame:
    """
    Dispatch to the requested internal XML parser.

    Instantiates ``_LxmlFrameParser`` or ``_EtreeFrameParser`` depending
    on ``parser``, runs it, and binds the parsed rows into a DataFrame.

    Raises
    ------
    ImportError
        * If lxml is selected as parser but is not installed.

    ValueError
        * If parser is not lxml or etree.
    """

    p: _EtreeFrameParser | _LxmlFrameParser

    # Both parser classes share the same constructor signature.
    parser_args = (
        path_or_buffer,
        xpath,
        namespaces,
        elems_only,
        attrs_only,
        names,
        dtype,
        converters,
        parse_dates,
        encoding,
        stylesheet,
        iterparse,
        compression,
        storage_options,
    )

    if parser == "lxml":
        # errors="ignore" returns None instead of raising so a tailored
        # message can be raised here.
        lxml = import_optional_dependency("lxml.etree", errors="ignore")
        if lxml is None:
            raise ImportError("lxml not found, please install or use the etree parser.")
        p = _LxmlFrameParser(*parser_args)
    elif parser == "etree":
        p = _EtreeFrameParser(*parser_args)
    else:
        raise ValueError("Values for parser can only be lxml or etree.")

    data_dicts = p.parse_data()

    return _data_to_frame(
        data=data_dicts,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        **kwargs,
    )

836 

837 

@deprecate_nonkeyword_arguments(version=None, allowed_args=["path_or_buffer"])
@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
def read_xml(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    xpath: str = "./*",
    namespaces: dict[str, str] | None = None,
    elems_only: bool = False,
    attrs_only: bool = False,
    names: Sequence[str] | None = None,
    dtype: DtypeArg | None = None,
    converters: ConvertersArg | None = None,
    parse_dates: ParseDatesArg | None = None,
    # encoding can not be None for lxml and StringIO input
    encoding: str | None = "utf-8",
    parser: XMLParsers = "lxml",
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
    iterparse: dict[str, list[str]] | None = None,
    compression: CompressionOptions = "infer",
    storage_options: StorageOptions = None,
) -> DataFrame:
    r"""
    Read XML document into a ``DataFrame`` object.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    path_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a ``read()`` function. The string can be any valid XML
        string or a path. The string can further be a URL. Valid URL schemes
        include http, ftp, s3, and file.

    xpath : str, optional, default './\*'
        The XPath to parse required set of nodes for migration to DataFrame.
        XPath should return a collection of elements and not a single
        element. Note: The ``etree`` parser supports limited XPath
        expressions. For more complex XPath, use ``lxml`` which requires
        installation.

    namespaces : dict, optional
        The namespaces defined in XML document as dicts with key being
        namespace prefix and value the URI. There is no need to include all
        namespaces in XML, only the ones used in ``xpath`` expression.
        Note: if XML document uses default namespace denoted as
        `xmlns='<URI>'` without a prefix, you must assign any temporary
        namespace prefix such as 'doc' to the URI in order to parse
        underlying nodes and/or attributes. For example, ::

            namespaces = {{"doc": "https://example.com"}}

    elems_only : bool, optional, default False
        Parse only the child elements at the specified ``xpath``. By default,
        all child elements and non-empty text nodes are returned.

    attrs_only :  bool, optional, default False
        Parse only the attributes at the specified ``xpath``.
        By default, all attributes are returned.

    names :  list-like, optional
        Column names for DataFrame of parsed XML data. Use this parameter to
        rename original element names and distinguish same named elements and
        attributes.

    dtype : Type name or dict of column -> type, optional
        Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
        'c': 'Int64'}}
        Use `str` or `object` together with suitable `na_values` settings
        to preserve and not interpret dtype.
        If converters are specified, they will be applied INSTEAD
        of dtype conversion.

        .. versionadded:: 1.5.0

    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can either
        be integers or column labels.

        .. versionadded:: 1.5.0

    parse_dates : bool or list of int or names or list of lists or dict, default False
        Identifiers to parse index or columns to datetime. The behavior is as follows:

        * boolean. If True -> try parsing the index.
        * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
          each as a separate date column.
        * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
          a single date column.
        * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
          result 'foo'

        .. versionadded:: 1.5.0

    encoding : str, optional, default 'utf-8'
        Encoding of XML document.

    parser : {{'lxml','etree'}}, default 'lxml'
        Parser module to use for retrieval of data. Only 'lxml' and
        'etree' are supported. With 'lxml' more complex XPath searches
        and ability to use XSLT stylesheet are supported.

    stylesheet : str, path object or file-like object
        A URL, file-like object, or a raw string containing an XSLT script.
        This stylesheet should flatten complex, deeply nested XML documents
        for easier parsing. To use this feature you must have ``lxml`` module
        installed and specify 'lxml' as ``parser``. The ``xpath`` must
        reference nodes of transformed XML document generated after XSLT
        transformation and not the original XML document. Only XSLT 1.0
        scripts and not later versions is currently supported.

    iterparse : dict, optional
        The nodes or attributes to retrieve in iterparsing of XML document
        as a dict with key being the name of repeating element and value being
        list of elements or attribute names that are descendants of the repeated
        element. Note: If this option is used, it will replace ``xpath`` parsing
        and unlike xpath, descendants do not need to relate to each other but can
        exist any where in document under the repeating element. This memory-
        efficient method should be used for very large XML files (500MB, 1GB, or 5GB+).
        For example, ::

            iterparse = {{"row_element": ["child_elem", "attr", "grandchild_elem"]}}

        .. versionadded:: 1.5.0

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    Returns
    -------
    df
        A DataFrame.

    See Also
    --------
    read_json : Convert a JSON string to pandas object.
    read_html : Read HTML tables into a list of DataFrame objects.

    Notes
    -----
    This method is best designed to import shallow XML documents in
    following format which is the ideal fit for the two-dimensions of a
    ``DataFrame`` (row by column). ::

            <root>
                <row>
                  <column1>data</column1>
                  <column2>data</column2>
                  <column3>data</column3>
                  ...
               </row>
               <row>
                  ...
               </row>
               ...
            </root>

    As a file format, XML documents can be designed any way including
    layout of elements and attributes as long as it conforms to W3C
    specifications. Therefore, this method is a convenience handler for
    a specific flatter design and not all possible XML structures.

    However, for more complex XML documents, ``stylesheet`` allows you to
    temporarily redesign original document with XSLT (a special purpose
    language) for a flatter version for migration to a DataFrame.

    This function will *always* return a single :class:`DataFrame` or raise
    exceptions due to issues with XML document, ``xpath``, or other
    parameters.

    See the :ref:`read_xml documentation in the IO section of the docs
    <io.read_xml>` for more information in using this method to parse XML
    files to DataFrames.

    Examples
    --------
    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data xmlns="http://example.com">
    ...  <row>
    ...    <shape>square</shape>
    ...    <degrees>360</degrees>
    ...    <sides>4.0</sides>
    ...  </row>
    ...  <row>
    ...    <shape>circle</shape>
    ...    <degrees>360</degrees>
    ...    <sides/>
    ...  </row>
    ...  <row>
    ...    <shape>triangle</shape>
    ...    <degrees>180</degrees>
    ...    <sides>3.0</sides>
    ...  </row>
    ... </data>'''

    >>> df = pd.read_xml(xml)
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data>
    ...   <row shape="square" degrees="360" sides="4.0"/>
    ...   <row shape="circle" degrees="360"/>
    ...   <row shape="triangle" degrees="180" sides="3.0"/>
    ... </data>'''

    >>> df = pd.read_xml(xml, xpath=".//row")
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <doc:data xmlns:doc="https://example.com">
    ...   <doc:row>
    ...     <doc:shape>square</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides>4.0</doc:sides>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>circle</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides/>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>triangle</doc:shape>
    ...     <doc:degrees>180</doc:degrees>
    ...     <doc:sides>3.0</doc:sides>
    ...   </doc:row>
    ... </doc:data>'''

    >>> df = pd.read_xml(xml,
    ...                  xpath="//doc:row",
    ...                  namespaces={{"doc": "https://example.com"}})
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0
    """

    # Thin public wrapper: argument validation, parser dispatch (lxml vs
    # etree), and DataFrame assembly all happen downstream in _parse.
    return _parse(
        path_or_buffer=path_or_buffer,
        xpath=xpath,
        namespaces=namespaces,
        elems_only=elems_only,
        attrs_only=attrs_only,
        names=names,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        encoding=encoding,
        parser=parser,
        stylesheet=stylesheet,
        iterparse=iterparse,
        compression=compression,
        storage_options=storage_options,
    )