Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/xml.py: 13%
226 statements
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
« prev ^ index » next coverage.py v6.4.4, created at 2023-07-17 14:22 -0600
1"""
2:mod:`pandas.io.xml` is a module for reading XML.
3"""
5from __future__ import annotations
7import io
8from typing import (
9 Any,
10 Callable,
11 Sequence,
12)
14from pandas._typing import (
15 TYPE_CHECKING,
16 CompressionOptions,
17 ConvertersArg,
18 DtypeArg,
19 FilePath,
20 ParseDatesArg,
21 ReadBuffer,
22 StorageOptions,
23 XMLParsers,
24)
25from pandas.compat._optional import import_optional_dependency
26from pandas.errors import (
27 AbstractMethodError,
28 ParserError,
29)
30from pandas.util._decorators import (
31 deprecate_nonkeyword_arguments,
32 doc,
33)
35from pandas.core.dtypes.common import is_list_like
37from pandas.core.shared_docs import _shared_docs
39from pandas.io.common import (
40 file_exists,
41 get_handle,
42 infer_compression,
43 is_fsspec_url,
44 is_url,
45 stringify_path,
46)
47from pandas.io.parsers import TextParser
49if TYPE_CHECKING: 49 ↛ 50line 49 didn't jump to line 50, because the condition on line 49 was never true
50 from xml.etree.ElementTree import Element
52 from lxml.etree import (
53 _Element,
54 _XSLTResultTree,
55 )
57 from pandas import DataFrame
@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
class _XMLFrameParser:
    """
    Internal subclass to parse XML into DataFrames.

    Parameters
    ----------
    path_or_buffer : a valid JSON str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file.

    xpath : str or regex
        The XPath expression to parse required set of nodes for
        migration to `Data Frame`. `etree` supports limited XPath.

    namespaces : dict
        The namespaces defined in XML document (``xmlns:namespace='URI'``)
        as dicts with key being namespace and value the URI.

    elems_only : bool
        Parse only the child elements at the specified `xpath`.

    attrs_only : bool
        Parse only the attributes at the specified `xpath`.

    names : list
        Column names for Data Frame of parsed XML data.

    dtype : dict
        Data type for data or columns. E.g. {{'a': np.float64,
        'b': np.int32, 'c': 'Int64'}}

        .. versionadded:: 1.5.0

    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels.

        .. versionadded:: 1.5.0

    parse_dates : bool or list of int or names or list of lists or dict
        Converts either index or select columns to datetimes

        .. versionadded:: 1.5.0

    encoding : str
        Encoding of xml object or document.

    stylesheet : str or file-like
        URL, file, file-like object, or a raw string containing XSLT,
        `etree` does not support XSLT but retained for consistency.

    iterparse : dict, optional
        Dict with row element as key and list of descendant elements
        and/or attributes as value to be retrieved in iterparsing of
        XML document.

        .. versionadded:: 1.5.0

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    See also
    --------
    pandas.io.xml._EtreeFrameParser
    pandas.io.xml._LxmlFrameParser

    Notes
    -----
    To subclass this class effectively you must override the following methods:
        * :func:`parse_data`
        * :func:`_parse_nodes`
        * :func:`_iterparse_nodes`
        * :func:`_parse_doc`
        * :func:`_validate_names`
        * :func:`_validate_path`

    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
        xpath: str,
        namespaces: dict[str, str] | None,
        elems_only: bool,
        attrs_only: bool,
        names: Sequence[str] | None,
        dtype: DtypeArg | None,
        converters: ConvertersArg | None,
        parse_dates: ParseDatesArg | None,
        encoding: str | None,
        stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
        iterparse: dict[str, list[str]] | None,
        compression: CompressionOptions,
        storage_options: StorageOptions,
    ) -> None:
        self.path_or_buffer = path_or_buffer
        self.xpath = xpath
        self.namespaces = namespaces
        self.elems_only = elems_only
        self.attrs_only = attrs_only
        self.names = names
        self.dtype = dtype
        self.converters = converters
        self.parse_dates = parse_dates
        self.encoding = encoding
        self.stylesheet = stylesheet
        self.iterparse = iterparse
        # NOTE(review): set but never read in this class — presumably a
        # leftover flag; confirm against subclasses before removing.
        self.is_style = None
        self.compression = compression
        self.storage_options = storage_options

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate xpath, names, parse and return specific nodes.
        """

        raise AbstractMethodError(self)

    def _parse_nodes(self, elems: list[Any]) -> list[dict[str, str | None]]:
        """
        Parse xml nodes.

        This method will parse the children and attributes of elements
        in xpath, conditionally for only elements, only attributes
        or both while optionally renaming node names.

        Raises
        ------
        ValueError
            * If only elements and only attributes are specified.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes compared to siblings
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]]

        if self.elems_only and self.attrs_only:
            raise ValueError("Either element or attributes can be parsed not both.")
        elif self.elems_only:
            if self.names:
                # One dict per row element: the element's own non-whitespace
                # text (keyed by its tag) merged with its children, the
                # children re-keyed positionally by self.names.
                dicts = [
                    {
                        **(
                            {el.tag: el.text.strip()}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            nm: ch.text.strip() if ch.text else None
                            for nm, ch in zip(self.names, el.findall("*"))
                        },
                    }
                    for el in elems
                ]
            else:
                dicts = [
                    {
                        ch.tag: ch.text.strip() if ch.text else None
                        for ch in el.findall("*")
                    }
                    for el in elems
                ]

        elif self.attrs_only:
            dicts = [
                {k: v.strip() if v else None for k, v in el.attrib.items()}
                for el in elems
            ]

        else:
            if self.names:
                # Attributes first, then element text, then named children —
                # later entries overwrite earlier ones on key collision.
                dicts = [
                    {
                        **el.attrib,
                        **(
                            {el.tag: el.text.strip()}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            nm: ch.text.strip() if ch.text else None
                            for nm, ch in zip(self.names, el.findall("*"))
                        },
                    }
                    for el in elems
                ]

            else:
                dicts = [
                    {
                        **el.attrib,
                        **(
                            {el.tag: el.text.strip()}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            ch.tag: ch.text.strip() if ch.text else None
                            for ch in el.findall("*")
                        },
                    }
                    for el in elems
                ]

        # Strip namespace URIs: keys arrive as "{uri}tag"; keep only "tag".
        dicts = [
            {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts
        ]

        # Union of all keys in first-seen order; pad missing keys with None
        # so every row dict has the same columns.
        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]

        return dicts

    def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
        """
        Iterparse xml nodes.

        This method will read in local disk, decompressed XML files for elements
        and underlying descendants using iterparse, a method to iterate through
        an XML tree without holding entire XML tree in memory.

        Raises
        ------
        TypeError
            * If `iterparse` is not a dict or its dict value is not list-like.
        ParserError
            * If `path_or_buffer` is not a physical, decompressed file on disk.
            * If no data is returned from selected items in `iterparse`.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes in submitted list
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]] = []
        # `row` is non-None only while we are inside a row element.
        row: dict[str, str | None] | None = None

        if not isinstance(self.iterparse, dict):
            raise TypeError(
                f"{type(self.iterparse).__name__} is not a valid type for iterparse"
            )

        # The single dict key names the repeating row element.
        row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
        if not is_list_like(self.iterparse[row_node]):
            raise TypeError(
                f"{type(self.iterparse[row_node])} is not a valid type "
                "for value in iterparse"
            )

        # iterparse only works on plain, uncompressed files on local disk:
        # reject URLs, fsspec paths, raw XML strings and compressed paths.
        if (
            not isinstance(self.path_or_buffer, str)
            or is_url(self.path_or_buffer)
            or is_fsspec_url(self.path_or_buffer)
            or self.path_or_buffer.startswith(("<?xml", "<"))
            or infer_compression(self.path_or_buffer, "infer") is not None
        ):
            raise ParserError(
                "iterparse is designed for large XML files that are fully extracted on "
                "local disk and not as compressed files or online sources."
            )

        for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
            # Drop "{uri}" namespace prefix from the tag, if any.
            curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag

            if event == "start":
                if curr_elem == row_node:
                    row = {}

            if row is not None:
                if self.names:
                    for col, nm in zip(self.iterparse[row_node], self.names):
                        if curr_elem == col:
                            elem_val = elem.text.strip() if elem.text else None
                            # only the first occurrence per row is kept
                            if row.get(nm) != elem_val and nm not in row:
                                row[nm] = elem_val
                        if col in elem.attrib:
                            if elem.attrib[col] not in row.values() and nm not in row:
                                row[nm] = elem.attrib[col]
                else:
                    for col in self.iterparse[row_node]:
                        if curr_elem == col:
                            row[col] = elem.text.strip() if elem.text else None
                        if col in elem.attrib:
                            row[col] = elem.attrib[col]

            if event == "end":
                if curr_elem == row_node and row is not None:
                    dicts.append(row)
                    row = None

                # Free memory as we go; the getprevious/getparent cleanup
                # is only available on lxml elements (hence the hasattr).
                elem.clear()
                if hasattr(elem, "getprevious"):
                    while (
                        elem.getprevious() is not None and elem.getparent() is not None
                    ):
                        del elem.getparent()[0]

        if dicts == []:
            raise ParserError("No result from selected items in iterparse.")

        # Union of keys in first-seen order; pad missing keys with None.
        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]

        return dicts

    def _validate_path(self) -> None:
        """
        Validate xpath.

        This method checks for syntax, evaluation, or empty nodes return.

        Raises
        ------
        SyntaxError
            * If xpath is not supported or issues with namespaces.

        ValueError
            * If xpath does not return any nodes.
        """

        raise AbstractMethodError(self)

    def _validate_names(self) -> None:
        """
        Validate names.

        This method will check if names is a list-like and aligns
        with length of parse nodes.

        Raises
        ------
        ValueError
            * If value is not a list and less then length of nodes.
        """
        raise AbstractMethodError(self)

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element | _Element:
        """
        Build tree from path_or_buffer.

        This method will parse XML object into tree
        either from string/bytes or file location.
        """
        raise AbstractMethodError(self)
class _EtreeFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with the Python
    standard library XML module: `xml.etree.ElementTree`.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse XML into row dicts, via either a full parse or iterparse.
        """
        from xml.etree.ElementTree import iterparse

        # XSLT requires lxml; fail loudly rather than silently ignoring it.
        if self.stylesheet is not None:
            raise ValueError(
                "To use stylesheet, you need lxml installed and selected as parser."
            )

        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)
            self._validate_path()
            elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)

        self._validate_names()

        xml_dicts: list[dict[str, str | None]] = (
            self._parse_nodes(elems)
            if self.iterparse is None
            else self._iterparse_nodes(iterparse)
        )

        return xml_dicts

    def _validate_path(self) -> None:
        """
        Notes
        -----
        `etree` supports limited XPath. If user attempts a more complex
        expression syntax error will raise.
        """

        msg = (
            "xpath does not return any nodes. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )
        try:
            elems = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
            if elems is None:
                raise ValueError(msg)

            # A match with neither children nor attributes yields no columns.
            if elems is not None and elems.find("*") is None and elems.attrib is None:
                raise ValueError(msg)

        except (KeyError, SyntaxError):
            raise SyntaxError(
                "You have used an incorrect or unsupported XPath "
                "expression for etree library or you used an "
                "undeclared namespace prefix."
            )

    def _validate_names(self) -> None:
        """
        Check that ``names`` is list-like and long enough to cover the
        child elements found under the first matched node.
        """
        children: list[Any]

        if self.names:
            if self.iterparse:
                # iterparse mode: names must cover the descendant list.
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
                children = parent.findall("*") if parent else []

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element:
        """
        Read, decompress and parse ``raw_doc`` into an ElementTree root.
        """
        from xml.etree.ElementTree import (
            XMLParser,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)
            doc = parse(xml_data, parser=curr_parser)

        return doc.getroot()
class _LxmlFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with third-party
    full-featured XML library, `lxml`, that supports
    XPath 1.0 and XSLT 1.0.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate xpath, names, optionally parse and run XSLT,
        and parse original or transformed XML and return specific nodes.
        """
        from lxml.etree import iterparse

        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)

            if self.stylesheet:
                # XSLT-transform the document first; xpath then applies to
                # the transformed tree, not the original.
                self.xsl_doc = self._parse_doc(self.stylesheet)
                self.xml_doc = self._transform_doc()

            self._validate_path()
            elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)

        self._validate_names()

        xml_dicts: list[dict[str, str | None]] = (
            self._parse_nodes(elems)
            if self.iterparse is None
            else self._iterparse_nodes(iterparse)
        )

        return xml_dicts

    def _validate_path(self) -> None:
        """
        Raise ValueError when xpath matches nothing, or matches nodes that
        carry neither child elements nor attributes.
        """

        msg = (
            "xpath does not return any nodes. "
            "Be sure row level nodes are in xpath. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )

        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
        children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces)
        attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces)

        if elems == []:
            raise ValueError(msg)

        if elems != [] and attrs == [] and children == []:
            raise ValueError(msg)

    def _validate_names(self) -> None:
        """
        Check that ``names`` is list-like and long enough to cover the
        children of the first node matched by xpath.
        """
        children: list[Any]

        if self.names:
            if self.iterparse:
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                # "[1]" limits the query to the first matched node.
                children = self.xml_doc.xpath(
                    self.xpath + "[1]/*", namespaces=self.namespaces
                )

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> _Element:
        """
        Read, decompress and parse ``raw_doc`` into an lxml element tree.
        """
        from lxml.etree import (
            XMLParser,
            fromstring,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)

            if isinstance(xml_data, io.StringIO):
                # lxml cannot parse str with an encoding declaration, so
                # re-encode StringIO content to bytes first.
                if self.encoding is None:
                    raise TypeError(
                        "Can not pass encoding None when input is StringIO."
                    )

                doc = fromstring(
                    xml_data.getvalue().encode(self.encoding), parser=curr_parser
                )
            else:
                doc = parse(xml_data, parser=curr_parser)

        return doc

    def _transform_doc(self) -> _XSLTResultTree:
        """
        Transform original tree using stylesheet.

        This method will transform original xml using XSLT script into
        am ideally flatter xml document for easier parsing and migration
        to Data Frame.
        """
        from lxml.etree import XSLT

        transformer = XSLT(self.xsl_doc)
        new_doc = transformer(self.xml_doc)

        return new_doc
def get_data_from_filepath(
    filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
    encoding: str | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
    """
    Extract raw XML data.

    The method accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. XML string or bytes

    This method turns (1) into (2) to simplify the rest of the processing.
    It returns input types (2) and (3) unchanged.
    """
    if not isinstance(filepath_or_buffer, bytes):
        filepath_or_buffer = stringify_path(filepath_or_buffer)

    # A string that does not look like literal XML markup may be a path or
    # URL; if it is, open it and hand back its contents (or its handle).
    if isinstance(filepath_or_buffer, str) and not filepath_or_buffer.startswith(
        ("<?xml", "<")
    ):
        if (
            is_url(filepath_or_buffer)
            or is_fsspec_url(filepath_or_buffer)
            or file_exists(filepath_or_buffer)
        ):
            with get_handle(
                filepath_or_buffer,
                "r",
                encoding=encoding,
                compression=compression,
                storage_options=storage_options,
            ) as handle_obj:
                handle = handle_obj.handle
                # error: Incompatible types in assignment (expression has type
                # "Union[str, IO[str]]", variable has type "Union[Union[str,
                # PathLike[str]], bytes, ReadBuffer[bytes], ReadBuffer[str]]")
                filepath_or_buffer = (
                    handle.read()  # type: ignore[assignment]
                    if hasattr(handle, "read")
                    else handle
                )

    return filepath_or_buffer
def preprocess_data(data) -> io.StringIO | io.BytesIO:
    """
    Convert extracted raw data.

    This method will return underlying data of extracted XML content.
    The data either has a `read` attribute (e.g. a file object or a
    StringIO/BytesIO) or is a string or bytes that is an XML document.
    """
    if isinstance(data, str):
        return io.StringIO(data)

    if isinstance(data, bytes):
        return io.BytesIO(data)

    # Already file-like: hand it back untouched.
    return data
def _data_to_frame(data, **kwargs) -> DataFrame:
    """
    Convert parsed data to Data Frame.

    This method will bind xml dictionary data of keys and values
    into named columns of Data Frame using the built-in TextParser
    class that build Data Frame and infers specific dtypes.
    """
    # All dicts share the same keys (padded upstream), so the first
    # dict's keys serve as column names; values become the rows.
    column_names = next(iter(data))
    rows = [list(record.values()) for record in data]

    try:
        with TextParser(rows, names=column_names, **kwargs) as parser:
            return parser.read()
    except ParserError:
        raise ParserError(
            "XML document may be too complex for import. "
            "Try to flatten document and use distinct "
            "element and attribute names."
        )
def _parse(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    xpath: str,
    namespaces: dict[str, str] | None,
    elems_only: bool,
    attrs_only: bool,
    names: Sequence[str] | None,
    dtype: DtypeArg | None,
    converters: ConvertersArg | None,
    parse_dates: ParseDatesArg | None,
    encoding: str | None,
    parser: XMLParsers,
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
    iterparse: dict[str, list[str]] | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
    **kwargs,
) -> DataFrame:
    """
    Call internal parsers.

    This method will conditionally call internal parsers:
    LxmlFrameParser and/or EtreeParser.

    Raises
    ------
    ImportError
        * If lxml is not installed if selected as parser.

    ValueError
        * If parser is not lxml or etree.
    """
    klass: type[_EtreeFrameParser] | type[_LxmlFrameParser]

    if parser == "lxml":
        # lxml is optional; fall over with a clear message if missing.
        lxml = import_optional_dependency("lxml.etree", errors="ignore")
        if lxml is None:
            raise ImportError("lxml not found, please install or use the etree parser.")
        klass = _LxmlFrameParser
    elif parser == "etree":
        klass = _EtreeFrameParser
    else:
        raise ValueError("Values for parser can only be lxml or etree.")

    p: _EtreeFrameParser | _LxmlFrameParser = klass(
        path_or_buffer,
        xpath,
        namespaces,
        elems_only,
        attrs_only,
        names,
        dtype,
        converters,
        parse_dates,
        encoding,
        stylesheet,
        iterparse,
        compression,
        storage_options,
    )

    data_dicts = p.parse_data()

    return _data_to_frame(
        data=data_dicts,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        **kwargs,
    )
@deprecate_nonkeyword_arguments(version=None, allowed_args=["path_or_buffer"])
@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
def read_xml(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    xpath: str = "./*",
    namespaces: dict[str, str] | None = None,
    elems_only: bool = False,
    attrs_only: bool = False,
    names: Sequence[str] | None = None,
    dtype: DtypeArg | None = None,
    converters: ConvertersArg | None = None,
    parse_dates: ParseDatesArg | None = None,
    # encoding can not be None for lxml and StringIO input
    encoding: str | None = "utf-8",
    parser: XMLParsers = "lxml",
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
    iterparse: dict[str, list[str]] | None = None,
    compression: CompressionOptions = "infer",
    storage_options: StorageOptions = None,
) -> DataFrame:
    r"""
    Read XML document into a ``DataFrame`` object.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    path_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a ``read()`` function. The string can be any valid XML
        string or a path. The string can further be a URL. Valid URL schemes
        include http, ftp, s3, and file.

    xpath : str, optional, default './\*'
        The XPath to parse required set of nodes for migration to DataFrame.
        XPath should return a collection of elements and not a single
        element. Note: The ``etree`` parser supports limited XPath
        expressions. For more complex XPath, use ``lxml`` which requires
        installation.

    namespaces : dict, optional
        The namespaces defined in XML document as dicts with key being
        namespace prefix and value the URI. There is no need to include all
        namespaces in XML, only the ones used in ``xpath`` expression.
        Note: if XML document uses default namespace denoted as
        `xmlns='<URI>'` without a prefix, you must assign any temporary
        namespace prefix such as 'doc' to the URI in order to parse
        underlying nodes and/or attributes. For example, ::

            namespaces = {{"doc": "https://example.com"}}

    elems_only : bool, optional, default False
        Parse only the child elements at the specified ``xpath``. By default,
        all child elements and non-empty text nodes are returned.

    attrs_only :  bool, optional, default False
        Parse only the attributes at the specified ``xpath``.
        By default, all attributes are returned.

    names :  list-like, optional
        Column names for DataFrame of parsed XML data. Use this parameter to
        rename original element names and distinguish same named elements and
        attributes.

    dtype : Type name or dict of column -> type, optional
        Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
        'c': 'Int64'}}
        Use `str` or `object` together with suitable `na_values` settings
        to preserve and not interpret dtype.
        If converters are specified, they will be applied INSTEAD
        of dtype conversion.

        .. versionadded:: 1.5.0

    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can either
        be integers or column labels.

        .. versionadded:: 1.5.0

    parse_dates : bool or list of int or names or list of lists or dict, default False
        Identifiers to parse index or columns to datetime. The behavior is as follows:

        * boolean. If True -> try parsing the index.
        * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
          each as a separate date column.
        * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
          a single date column.
        * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
          result 'foo'

        .. versionadded:: 1.5.0

    encoding : str, optional, default 'utf-8'
        Encoding of XML document.

    parser : {{'lxml','etree'}}, default 'lxml'
        Parser module to use for retrieval of data. Only 'lxml' and
        'etree' are supported. With 'lxml' more complex XPath searches
        and ability to use XSLT stylesheet are supported.

    stylesheet : str, path object or file-like object
        A URL, file-like object, or a raw string containing an XSLT script.
        This stylesheet should flatten complex, deeply nested XML documents
        for easier parsing. To use this feature you must have ``lxml`` module
        installed and specify 'lxml' as ``parser``. The ``xpath`` must
        reference nodes of transformed XML document generated after XSLT
        transformation and not the original XML document. Only XSLT 1.0
        scripts and not later versions is currently supported.

    iterparse : dict, optional
        The nodes or attributes to retrieve in iterparsing of XML document
        as a dict with key being the name of repeating element and value being
        list of elements or attribute names that are descendants of the repeated
        element. Note: If this option is used, it will replace ``xpath`` parsing
        and unlike xpath, descendants do not need to relate to each other but can
        exist any where in document under the repeating element. This memory-
        efficient method should be used for very large XML files (500MB, 1GB, or 5GB+).
        For example, ::

            iterparse = {{"row_element": ["child_elem", "attr", "grandchild_elem"]}}

        .. versionadded:: 1.5.0

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    Returns
    -------
    df
        A DataFrame.

    See Also
    --------
    read_json : Convert a JSON string to pandas object.
    read_html : Read HTML tables into a list of DataFrame objects.

    Notes
    -----
    This method is best designed to import shallow XML documents in
    following format which is the ideal fit for the two-dimensions of a
    ``DataFrame`` (row by column). ::

            <root>
                <row>
                  <column1>data</column1>
                  <column2>data</column2>
                  <column3>data</column3>
                  ...
               </row>
               <row>
                  ...
               </row>
               ...
            </root>

    As a file format, XML documents can be designed any way including
    layout of elements and attributes as long as it conforms to W3C
    specifications. Therefore, this method is a convenience handler for
    a specific flatter design and not all possible XML structures.

    However, for more complex XML documents, ``stylesheet`` allows you to
    temporarily redesign original document with XSLT (a special purpose
    language) for a flatter version for migration to a DataFrame.

    This function will *always* return a single :class:`DataFrame` or raise
    exceptions due to issues with XML document, ``xpath``, or other
    parameters.

    See the :ref:`read_xml documentation in the IO section of the docs
    <io.read_xml>` for more information in using this method to parse XML
    files to DataFrames.

    Examples
    --------
    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data xmlns="http://example.com">
    ...  <row>
    ...    <shape>square</shape>
    ...    <degrees>360</degrees>
    ...    <sides>4.0</sides>
    ...  </row>
    ...  <row>
    ...    <shape>circle</shape>
    ...    <degrees>360</degrees>
    ...    <sides/>
    ...  </row>
    ...  <row>
    ...    <shape>triangle</shape>
    ...    <degrees>180</degrees>
    ...    <sides>3.0</sides>
    ...  </row>
    ... </data>'''

    >>> df = pd.read_xml(xml)
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data>
    ...   <row shape="square" degrees="360" sides="4.0"/>
    ...   <row shape="circle" degrees="360"/>
    ...   <row shape="triangle" degrees="180" sides="3.0"/>
    ... </data>'''

    >>> df = pd.read_xml(xml, xpath=".//row")
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <doc:data xmlns:doc="https://example.com">
    ...   <doc:row>
    ...     <doc:shape>square</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides>4.0</doc:sides>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>circle</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides/>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>triangle</doc:shape>
    ...     <doc:degrees>180</doc:degrees>
    ...     <doc:sides>3.0</doc:sides>
    ...   </doc:row>
    ... </doc:data>'''

    >>> df = pd.read_xml(xml,
    ...                  xpath="//doc:row",
    ...                  namespaces={{"doc": "https://example.com"}})
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0
    """
    # Thin public wrapper: all validation and dispatch happens in _parse.
    return _parse(
        path_or_buffer=path_or_buffer,
        xpath=xpath,
        namespaces=namespaces,
        elems_only=elems_only,
        attrs_only=attrs_only,
        names=names,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        encoding=encoding,
        parser=parser,
        stylesheet=stylesheet,
        iterparse=iterparse,
        compression=compression,
        storage_options=storage_options,
    )