Coverage for /var/srv/projects/api.amasfac.comuna18.com/tmp/venv/lib/python3.9/site-packages/pandas/io/stata.py: 10%

1557 statements  

« prev     ^ index     » next       coverage.py v6.4.4, created at 2023-07-17 14:22 -0600

1""" 

2Module contains tools for processing Stata files into DataFrames 

3 

4The StataReader below was originally written by Joe Presbrey as part of PyDTA. 

5It has been extended and improved by Skipper Seabold from the Statsmodels 

6project who also developed the StataWriter and was finally added to pandas in 

7a once again improved version. 

8 

9You can find more information on http://presbrey.mit.edu/PyDTA and 

10https://www.statsmodels.org/devel/ 

11""" 

12from __future__ import annotations 

13 

14from collections import abc 

15import datetime 

16from io import BytesIO 

17import os 

18import struct 

19import sys 

20from typing import ( 

21 IO, 

22 TYPE_CHECKING, 

23 Any, 

24 AnyStr, 

25 Final, 

26 Hashable, 

27 Sequence, 

28 cast, 

29) 

30import warnings 

31 

32from dateutil.relativedelta import relativedelta 

33import numpy as np 

34 

35from pandas._libs.lib import infer_dtype 

36from pandas._libs.writers import max_len_string_array 

37from pandas._typing import ( 

38 CompressionOptions, 

39 FilePath, 

40 ReadBuffer, 

41 StorageOptions, 

42 WriteBuffer, 

43) 

44from pandas.errors import ( 

45 CategoricalConversionWarning, 

46 InvalidColumnName, 

47 PossiblePrecisionLoss, 

48 ValueLabelTypeMismatch, 

49) 

50from pandas.util._decorators import ( 

51 Appender, 

52 deprecate_nonkeyword_arguments, 

53 doc, 

54) 

55from pandas.util._exceptions import find_stack_level 

56 

57from pandas.core.dtypes.common import ( 

58 ensure_object, 

59 is_categorical_dtype, 

60 is_datetime64_dtype, 

61 is_numeric_dtype, 

62) 

63 

64from pandas import ( 

65 Categorical, 

66 DatetimeIndex, 

67 NaT, 

68 Timestamp, 

69 isna, 

70 to_datetime, 

71 to_timedelta, 

72) 

73from pandas.core.arrays.boolean import BooleanDtype 

74from pandas.core.arrays.integer import IntegerDtype 

75from pandas.core.frame import DataFrame 

76from pandas.core.indexes.base import Index 

77from pandas.core.series import Series 

78from pandas.core.shared_docs import _shared_docs 

79 

80from pandas.io.common import get_handle 

81 

82if TYPE_CHECKING: 82 ↛ 83line 82 didn't jump to line 83, because the condition on line 82 was never true

83 from typing import Literal 

84 

85_version_error = ( 

86 "Version of given Stata file is {version}. pandas supports importing " 

87 "versions 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), " 

88 "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16)," 

89 "and 119 (Stata 15/16, over 32,767 variables)." 

90) 

91 

92_statafile_processing_params1 = """\ 

93convert_dates : bool, default True 

94 Convert date variables to DataFrame time values. 

95convert_categoricals : bool, default True 

96 Read value labels and convert columns to Categorical/Factor variables.""" 

97 

98_statafile_processing_params2 = """\ 

99index_col : str, optional 

100 Column to set as index. 

101convert_missing : bool, default False 

102 Flag indicating whether to convert missing values to their Stata 

103 representations. If False, missing values are replaced with nan. 

104 If True, columns containing missing values are returned with 

105 object data types and missing values are represented by 

106 StataMissingValue objects. 

107preserve_dtypes : bool, default True 

108 Preserve Stata datatypes. If False, numeric data are upcast to pandas 

109 default types for foreign data (float64 or int64). 

110columns : list or None 

111 Columns to retain. Columns will be returned in the given order. None 

112 returns all columns. 

113order_categoricals : bool, default True 

114 Flag indicating whether converted categorical data are ordered.""" 

115 

116_chunksize_params = """\ 

117chunksize : int, default None 

118 Return StataReader object for iterations, returns chunks with 

119 given number of lines.""" 

120 

121_iterator_params = """\ 

122iterator : bool, default False 

123 Return StataReader object.""" 

124 

125_reader_notes = """\ 

126Notes 

127----- 

128Categorical variables read through an iterator may not have the same 

129categories and dtype. This occurs when a variable stored in a DTA 

130file is associated to an incomplete set of value labels that only 

131label a strict subset of the values.""" 

132 

133_read_stata_doc = f""" 

134Read Stata file into DataFrame. 

135 

136Parameters 

137---------- 

138filepath_or_buffer : str, path object or file-like object 

139 Any valid string path is acceptable. The string could be a URL. Valid 

140 URL schemes include http, ftp, s3, and file. For file URLs, a host is 

141 expected. A local file could be: ``file://localhost/path/to/table.dta``. 

142 

143 If you want to pass in a path object, pandas accepts any ``os.PathLike``. 

144 

145 By file-like object, we refer to objects with a ``read()`` method, 

146 such as a file handle (e.g. via builtin ``open`` function) 

147 or ``StringIO``. 

148{_statafile_processing_params1} 

149{_statafile_processing_params2} 

150{_chunksize_params} 

151{_iterator_params} 

152{_shared_docs["decompression_options"] % "filepath_or_buffer"} 

153{_shared_docs["storage_options"]} 

154 

155Returns 

156------- 

157DataFrame or StataReader 

158 

159See Also 

160-------- 

161io.stata.StataReader : Low-level reader for Stata data files. 

162DataFrame.to_stata: Export Stata data files. 

163 

164{_reader_notes} 

165 

166Examples 

167-------- 

168 

169Creating a dummy stata for this example 

170>>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', 

171... 'parrot'], 

172... 'speed': [350, 18, 361, 15]}}) # doctest: +SKIP 

173>>> df.to_stata('animals.dta') # doctest: +SKIP 

174 

175Read a Stata dta file: 

176 

177>>> df = pd.read_stata('animals.dta') # doctest: +SKIP 

178 

179Read a Stata dta file in 10,000 line chunks: 

180>>> values = np.random.randint(0, 10, size=(20_000, 1), dtype="uint8") # doctest: +SKIP 

181>>> df = pd.DataFrame(values, columns=["i"]) # doctest: +SKIP 

182>>> df.to_stata('filename.dta') # doctest: +SKIP 

183 

184>>> itr = pd.read_stata('filename.dta', chunksize=10000) # doctest: +SKIP 

185>>> for chunk in itr: 

186... # Operate on a single chunk, e.g., chunk.mean() 

187... pass # doctest: +SKIP 

188""" 

189 

190_read_method_doc = f"""\ 

191Reads observations from Stata file, converting them into a dataframe 

192 

193Parameters 

194---------- 

195nrows : int 

196 Number of lines to read from data file, if None read whole file. 

197{_statafile_processing_params1} 

198{_statafile_processing_params2} 

199 

200Returns 

201------- 

202DataFrame 

203""" 

204 

205_stata_reader_doc = f"""\ 

206Class for reading Stata dta files. 

207 

208Parameters 

209---------- 

210path_or_buf : path (string), buffer or path object 

211 string, path object (pathlib.Path or py._path.local.LocalPath) or object 

212 implementing a binary read() functions. 

213{_statafile_processing_params1} 

214{_statafile_processing_params2} 

215{_chunksize_params} 

216{_shared_docs["decompression_options"]} 

217{_shared_docs["storage_options"]} 

218 

219{_reader_notes} 

220""" 

221 

222 

223_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"] 

224 

225 

226stata_epoch: Final = datetime.datetime(1960, 1, 1) 

227 

228 

229# TODO: Add typing. As of January 2020 it is not possible to type this function since 

230# mypy doesn't understand that a Series and an int can be combined using mathematical 

231# operations. (+, -). 

232def _stata_elapsed_date_to_datetime_vec(dates, fmt) -> Series: 

233 """ 

234 Convert from SIF to datetime. https://www.stata.com/help.cgi?datetime 

235 

236 Parameters 

237 ---------- 

238 dates : Series 

239 The Stata Internal Format date to convert to datetime according to fmt 

240 fmt : str 

241 The format to convert to. Can be, tc, td, tw, tm, tq, th, ty 

242 Returns 

243 

244 Returns 

245 ------- 

246 converted : Series 

247 The converted dates 

248 

249 Examples 

250 -------- 

251 >>> dates = pd.Series([52]) 

252 >>> _stata_elapsed_date_to_datetime_vec(dates , "%tw") 

253 0 1961-01-01 

254 dtype: datetime64[ns] 

255 

256 Notes 

257 ----- 

258 datetime/c - tc 

259 milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day 

260 datetime/C - tC - NOT IMPLEMENTED 

261 milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds 

262 date - td 

263 days since 01jan1960 (01jan1960 = 0) 

264 weekly date - tw 

265 weeks since 1960w1 

266 This assumes 52 weeks in a year, then adds 7 * remainder of the weeks. 

267 The datetime value is the start of the week in terms of days in the 

268 year, not ISO calendar weeks. 

269 monthly date - tm 

270 months since 1960m1 

271 quarterly date - tq 

272 quarters since 1960q1 

273 half-yearly date - th 

274 half-years since 1960h1 yearly 

275 date - ty 

276 years since 0000 

277 """ 

278 MIN_YEAR, MAX_YEAR = Timestamp.min.year, Timestamp.max.year 

279 MAX_DAY_DELTA = (Timestamp.max - datetime.datetime(1960, 1, 1)).days 

280 MIN_DAY_DELTA = (Timestamp.min - datetime.datetime(1960, 1, 1)).days 

281 MIN_MS_DELTA = MIN_DAY_DELTA * 24 * 3600 * 1000 

282 MAX_MS_DELTA = MAX_DAY_DELTA * 24 * 3600 * 1000 

283 

284 def convert_year_month_safe(year, month) -> Series: 

285 """ 

286 Convert year and month to datetimes, using pandas vectorized versions 

287 when the date range falls within the range supported by pandas. 

288 Otherwise it falls back to a slower but more robust method 

289 using datetime. 

290 """ 

291 if year.max() < MAX_YEAR and year.min() > MIN_YEAR: 

292 return to_datetime(100 * year + month, format="%Y%m") 

293 else: 

294 index = getattr(year, "index", None) 

295 return Series( 

296 [datetime.datetime(y, m, 1) for y, m in zip(year, month)], index=index 

297 ) 

298 

299 def convert_year_days_safe(year, days) -> Series: 

300 """ 

301 Converts year (e.g. 1999) and days since the start of the year to a 

302 datetime or datetime64 Series 

303 """ 

304 if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR: 

305 return to_datetime(year, format="%Y") + to_timedelta(days, unit="d") 

306 else: 

307 index = getattr(year, "index", None) 

308 value = [ 

309 datetime.datetime(y, 1, 1) + relativedelta(days=int(d)) 

310 for y, d in zip(year, days) 

311 ] 

312 return Series(value, index=index) 

313 

314 def convert_delta_safe(base, deltas, unit) -> Series: 

315 """ 

316 Convert base dates and deltas to datetimes, using pandas vectorized 

317 versions if the deltas satisfy restrictions required to be expressed 

318 as dates in pandas. 

319 """ 

320 index = getattr(deltas, "index", None) 

321 if unit == "d": 

322 if deltas.max() > MAX_DAY_DELTA or deltas.min() < MIN_DAY_DELTA: 

323 values = [base + relativedelta(days=int(d)) for d in deltas] 

324 return Series(values, index=index) 

325 elif unit == "ms": 

326 if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA: 

327 values = [ 

328 base + relativedelta(microseconds=(int(d) * 1000)) for d in deltas 

329 ] 

330 return Series(values, index=index) 

331 else: 

332 raise ValueError("format not understood") 

333 base = to_datetime(base) 

334 deltas = to_timedelta(deltas, unit=unit) 

335 return base + deltas 

336 

337 # TODO(non-nano): If/when pandas supports more than datetime64[ns], this 

338 # should be improved to use correct range, e.g. datetime[Y] for yearly 

339 bad_locs = np.isnan(dates) 

340 has_bad_values = False 

341 if bad_locs.any(): 

342 has_bad_values = True 

343 # reset cache to avoid SettingWithCopy checks (we own the DataFrame and the 

344 # `dates` Series is used to overwrite itself in the DataFramae) 

345 dates._reset_cacher() 

346 dates[bad_locs] = 1.0 # Replace with NaT 

347 dates = dates.astype(np.int64) 

348 

349 if fmt.startswith(("%tc", "tc")): # Delta ms relative to base 

350 base = stata_epoch 

351 ms = dates 

352 conv_dates = convert_delta_safe(base, ms, "ms") 

353 elif fmt.startswith(("%tC", "tC")): 

354 

355 warnings.warn( 

356 "Encountered %tC format. Leaving in Stata Internal Format.", 

357 stacklevel=find_stack_level(), 

358 ) 

359 conv_dates = Series(dates, dtype=object) 

360 if has_bad_values: 

361 conv_dates[bad_locs] = NaT 

362 return conv_dates 

363 # Delta days relative to base 

364 elif fmt.startswith(("%td", "td", "%d", "d")): 

365 base = stata_epoch 

366 days = dates 

367 conv_dates = convert_delta_safe(base, days, "d") 

368 # does not count leap days - 7 days is a week. 

369 # 52nd week may have more than 7 days 

370 elif fmt.startswith(("%tw", "tw")): 

371 year = stata_epoch.year + dates // 52 

372 days = (dates % 52) * 7 

373 conv_dates = convert_year_days_safe(year, days) 

374 elif fmt.startswith(("%tm", "tm")): # Delta months relative to base 

375 year = stata_epoch.year + dates // 12 

376 month = (dates % 12) + 1 

377 conv_dates = convert_year_month_safe(year, month) 

378 elif fmt.startswith(("%tq", "tq")): # Delta quarters relative to base 

379 year = stata_epoch.year + dates // 4 

380 quarter_month = (dates % 4) * 3 + 1 

381 conv_dates = convert_year_month_safe(year, quarter_month) 

382 elif fmt.startswith(("%th", "th")): # Delta half-years relative to base 

383 year = stata_epoch.year + dates // 2 

384 month = (dates % 2) * 6 + 1 

385 conv_dates = convert_year_month_safe(year, month) 

386 elif fmt.startswith(("%ty", "ty")): # Years -- not delta 

387 year = dates 

388 first_month = np.ones_like(dates) 

389 conv_dates = convert_year_month_safe(year, first_month) 

390 else: 

391 raise ValueError(f"Date fmt {fmt} not understood") 

392 

393 if has_bad_values: # Restore NaT for bad values 

394 conv_dates[bad_locs] = NaT 

395 

396 return conv_dates 

397 

398 

399def _datetime_to_stata_elapsed_vec(dates: Series, fmt: str) -> Series: 

400 """ 

401 Convert from datetime to SIF. https://www.stata.com/help.cgi?datetime 

402 

403 Parameters 

404 ---------- 

405 dates : Series 

406 Series or array containing datetime.datetime or datetime64[ns] to 

407 convert to the Stata Internal Format given by fmt 

408 fmt : str 

409 The format to convert to. Can be, tc, td, tw, tm, tq, th, ty 

410 """ 

411 index = dates.index 

412 NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000 

413 US_PER_DAY = NS_PER_DAY / 1000 

414 

415 def parse_dates_safe(dates, delta=False, year=False, days=False): 

416 d = {} 

417 if is_datetime64_dtype(dates.dtype): 

418 if delta: 

419 time_delta = dates - stata_epoch 

420 d["delta"] = time_delta._values.view(np.int64) // 1000 # microseconds 

421 if days or year: 

422 date_index = DatetimeIndex(dates) 

423 d["year"] = date_index._data.year 

424 d["month"] = date_index._data.month 

425 if days: 

426 days_in_ns = dates.view(np.int64) - to_datetime( 

427 d["year"], format="%Y" 

428 ).view(np.int64) 

429 d["days"] = days_in_ns // NS_PER_DAY 

430 

431 elif infer_dtype(dates, skipna=False) == "datetime": 

432 if delta: 

433 delta = dates._values - stata_epoch 

434 

435 def f(x: datetime.timedelta) -> float: 

436 return US_PER_DAY * x.days + 1000000 * x.seconds + x.microseconds 

437 

438 v = np.vectorize(f) 

439 d["delta"] = v(delta) 

440 if year: 

441 year_month = dates.apply(lambda x: 100 * x.year + x.month) 

442 d["year"] = year_month._values // 100 

443 d["month"] = year_month._values - d["year"] * 100 

444 if days: 

445 

446 def g(x: datetime.datetime) -> int: 

447 return (x - datetime.datetime(x.year, 1, 1)).days 

448 

449 v = np.vectorize(g) 

450 d["days"] = v(dates) 

451 else: 

452 raise ValueError( 

453 "Columns containing dates must contain either " 

454 "datetime64, datetime.datetime or null values." 

455 ) 

456 

457 return DataFrame(d, index=index) 

458 

459 bad_loc = isna(dates) 

460 index = dates.index 

461 if bad_loc.any(): 

462 dates = Series(dates) 

463 if is_datetime64_dtype(dates): 

464 dates[bad_loc] = to_datetime(stata_epoch) 

465 else: 

466 dates[bad_loc] = stata_epoch 

467 

468 if fmt in ["%tc", "tc"]: 

469 d = parse_dates_safe(dates, delta=True) 

470 conv_dates = d.delta / 1000 

471 elif fmt in ["%tC", "tC"]: 

472 warnings.warn( 

473 "Stata Internal Format tC not supported.", 

474 stacklevel=find_stack_level(), 

475 ) 

476 conv_dates = dates 

477 elif fmt in ["%td", "td"]: 

478 d = parse_dates_safe(dates, delta=True) 

479 conv_dates = d.delta // US_PER_DAY 

480 elif fmt in ["%tw", "tw"]: 

481 d = parse_dates_safe(dates, year=True, days=True) 

482 conv_dates = 52 * (d.year - stata_epoch.year) + d.days // 7 

483 elif fmt in ["%tm", "tm"]: 

484 d = parse_dates_safe(dates, year=True) 

485 conv_dates = 12 * (d.year - stata_epoch.year) + d.month - 1 

486 elif fmt in ["%tq", "tq"]: 

487 d = parse_dates_safe(dates, year=True) 

488 conv_dates = 4 * (d.year - stata_epoch.year) + (d.month - 1) // 3 

489 elif fmt in ["%th", "th"]: 

490 d = parse_dates_safe(dates, year=True) 

491 conv_dates = 2 * (d.year - stata_epoch.year) + (d.month > 6).astype(int) 

492 elif fmt in ["%ty", "ty"]: 

493 d = parse_dates_safe(dates, year=True) 

494 conv_dates = d.year 

495 else: 

496 raise ValueError(f"Format {fmt} is not a known Stata date format") 

497 

498 conv_dates = Series(conv_dates, dtype=np.float64) 

499 missing_value = struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\xe0\x7f")[0] 

500 conv_dates[bad_loc] = missing_value 

501 

502 return Series(conv_dates, index=index) 

503 

504 

505excessive_string_length_error: Final = """ 

506Fixed width strings in Stata .dta files are limited to 244 (or fewer) 

507characters. Column '{0}' does not satisfy this restriction. Use the 

508'version=117' parameter to write the newer (Stata 13 and later) format. 

509""" 

510 

511 

512precision_loss_doc: Final = """ 

513Column converted from {0} to {1}, and some data are outside of the lossless 

514conversion range. This may result in a loss of precision in the saved data. 

515""" 

516 

517 

518value_label_mismatch_doc: Final = """ 

519Stata value labels (pandas categories) must be strings. Column {0} contains 

520non-string labels which will be converted to strings. Please check that the 

521Stata data file created has not lost information due to duplicate labels. 

522""" 

523 

524 

525invalid_name_doc: Final = """ 

526Not all pandas column names were valid Stata variable names. 

527The following replacements have been made: 

528 

529 {0} 

530 

531If this is not what you expect, please make sure you have Stata-compliant 

532column names in your DataFrame (strings only, max 32 characters, only 

533alphanumerics and underscores, no Stata reserved words) 

534""" 

535 

536 

537categorical_conversion_warning: Final = """ 

538One or more series with value labels are not fully labeled. Reading this 

539dataset with an iterator results in categorical variable with different 

540categories. This occurs since it is not possible to know all possible values 

541until the entire dataset has been read. To avoid this warning, you can either 

542read dataset without an iterator, or manually convert categorical data by 

543``convert_categoricals`` to False and then accessing the variable labels 

544through the value_labels method of the reader. 

545""" 

546 

547 

548def _cast_to_stata_types(data: DataFrame) -> DataFrame: 

549 """ 

550 Checks the dtypes of the columns of a pandas DataFrame for 

551 compatibility with the data types and ranges supported by Stata, and 

552 converts if necessary. 

553 

554 Parameters 

555 ---------- 

556 data : DataFrame 

557 The DataFrame to check and convert 

558 

559 Notes 

560 ----- 

561 Numeric columns in Stata must be one of int8, int16, int32, float32 or 

562 float64, with some additional value restrictions. int8 and int16 columns 

563 are checked for violations of the value restrictions and upcast if needed. 

564 int64 data is not usable in Stata, and so it is downcast to int32 whenever 

565 the value are in the int32 range, and sidecast to float64 when larger than 

566 this range. If the int64 values are outside of the range of those 

567 perfectly representable as float64 values, a warning is raised. 

568 

569 bool columns are cast to int8. uint columns are converted to int of the 

570 same size if there is no loss in precision, otherwise are upcast to a 

571 larger type. uint64 is currently not supported since it is concerted to 

572 object in a DataFrame. 

573 """ 

574 ws = "" 

575 # original, if small, if large 

576 conversion_data: tuple[ 

577 tuple[type, type, type], 

578 tuple[type, type, type], 

579 tuple[type, type, type], 

580 tuple[type, type, type], 

581 tuple[type, type, type], 

582 ] = ( 

583 (np.bool_, np.int8, np.int8), 

584 (np.uint8, np.int8, np.int16), 

585 (np.uint16, np.int16, np.int32), 

586 (np.uint32, np.int32, np.int64), 

587 (np.uint64, np.int64, np.float64), 

588 ) 

589 

590 float32_max = struct.unpack("<f", b"\xff\xff\xff\x7e")[0] 

591 float64_max = struct.unpack("<d", b"\xff\xff\xff\xff\xff\xff\xdf\x7f")[0] 

592 

593 for col in data: 

594 # Cast from unsupported types to supported types 

595 is_nullable_int = isinstance(data[col].dtype, (IntegerDtype, BooleanDtype)) 

596 orig = data[col] 

597 # We need to find orig_missing before altering data below 

598 orig_missing = orig.isna() 

599 if is_nullable_int: 

600 missing_loc = data[col].isna() 

601 if missing_loc.any(): 

602 # Replace with always safe value 

603 fv = 0 if isinstance(data[col].dtype, IntegerDtype) else False 

604 data.loc[missing_loc, col] = fv 

605 # Replace with NumPy-compatible column 

606 data[col] = data[col].astype(data[col].dtype.numpy_dtype) 

607 dtype = data[col].dtype 

608 for c_data in conversion_data: 

609 if dtype == c_data[0]: 

610 if data[col].max() <= np.iinfo(c_data[1]).max: 

611 dtype = c_data[1] 

612 else: 

613 dtype = c_data[2] 

614 if c_data[2] == np.int64: # Warn if necessary 

615 if data[col].max() >= 2**53: 

616 ws = precision_loss_doc.format("uint64", "float64") 

617 

618 data[col] = data[col].astype(dtype) 

619 

620 # Check values and upcast if necessary 

621 if dtype == np.int8: 

622 if data[col].max() > 100 or data[col].min() < -127: 

623 data[col] = data[col].astype(np.int16) 

624 elif dtype == np.int16: 

625 if data[col].max() > 32740 or data[col].min() < -32767: 

626 data[col] = data[col].astype(np.int32) 

627 elif dtype == np.int64: 

628 if data[col].max() <= 2147483620 and data[col].min() >= -2147483647: 

629 data[col] = data[col].astype(np.int32) 

630 else: 

631 data[col] = data[col].astype(np.float64) 

632 if data[col].max() >= 2**53 or data[col].min() <= -(2**53): 

633 ws = precision_loss_doc.format("int64", "float64") 

634 elif dtype in (np.float32, np.float64): 

635 if np.isinf(data[col]).any(): 

636 raise ValueError( 

637 f"Column {col} contains infinity or -infinity" 

638 "which is outside the range supported by Stata." 

639 ) 

640 value = data[col].max() 

641 if dtype == np.float32 and value > float32_max: 

642 data[col] = data[col].astype(np.float64) 

643 elif dtype == np.float64: 

644 if value > float64_max: 

645 raise ValueError( 

646 f"Column {col} has a maximum value ({value}) outside the range " 

647 f"supported by Stata ({float64_max})" 

648 ) 

649 if is_nullable_int: 

650 if orig_missing.any(): 

651 # Replace missing by Stata sentinel value 

652 sentinel = StataMissingValue.BASE_MISSING_VALUES[data[col].dtype.name] 

653 data.loc[orig_missing, col] = sentinel 

654 if ws: 

655 warnings.warn( 

656 ws, 

657 PossiblePrecisionLoss, 

658 stacklevel=find_stack_level(), 

659 ) 

660 

661 return data 

662 

663 

664class StataValueLabel: 

665 """ 

666 Parse a categorical column and prepare formatted output 

667 

668 Parameters 

669 ---------- 

670 catarray : Series 

671 Categorical Series to encode 

672 encoding : {"latin-1", "utf-8"} 

673 Encoding to use for value labels. 

674 """ 

675 

676 def __init__( 

677 self, catarray: Series, encoding: Literal["latin-1", "utf-8"] = "latin-1" 

678 ) -> None: 

679 

680 if encoding not in ("latin-1", "utf-8"): 

681 raise ValueError("Only latin-1 and utf-8 are supported.") 

682 self.labname = catarray.name 

683 self._encoding = encoding 

684 categories = catarray.cat.categories 

685 self.value_labels: list[tuple[float, str]] = list( 

686 zip(np.arange(len(categories)), categories) 

687 ) 

688 self.value_labels.sort(key=lambda x: x[0]) 

689 

690 self._prepare_value_labels() 

691 

692 def _prepare_value_labels(self): 

693 """Encode value labels.""" 

694 

695 self.text_len = 0 

696 self.txt: list[bytes] = [] 

697 self.n = 0 

698 # Offsets (length of categories), converted to int32 

699 self.off = np.array([], dtype=np.int32) 

700 # Values, converted to int32 

701 self.val = np.array([], dtype=np.int32) 

702 self.len = 0 

703 

704 # Compute lengths and setup lists of offsets and labels 

705 offsets: list[int] = [] 

706 values: list[float] = [] 

707 for vl in self.value_labels: 

708 category: str | bytes = vl[1] 

709 if not isinstance(category, str): 

710 category = str(category) 

711 warnings.warn( 

712 value_label_mismatch_doc.format(self.labname), 

713 ValueLabelTypeMismatch, 

714 stacklevel=find_stack_level(), 

715 ) 

716 category = category.encode(self._encoding) 

717 offsets.append(self.text_len) 

718 self.text_len += len(category) + 1 # +1 for the padding 

719 values.append(vl[0]) 

720 self.txt.append(category) 

721 self.n += 1 

722 

723 if self.text_len > 32000: 

724 raise ValueError( 

725 "Stata value labels for a single variable must " 

726 "have a combined length less than 32,000 characters." 

727 ) 

728 

729 # Ensure int32 

730 self.off = np.array(offsets, dtype=np.int32) 

731 self.val = np.array(values, dtype=np.int32) 

732 

733 # Total length 

734 self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len 

735 

736 def generate_value_label(self, byteorder: str) -> bytes: 

737 """ 

738 Generate the binary representation of the value labels. 

739 

740 Parameters 

741 ---------- 

742 byteorder : str 

743 Byte order of the output 

744 

745 Returns 

746 ------- 

747 value_label : bytes 

748 Bytes containing the formatted value label 

749 """ 

750 encoding = self._encoding 

751 bio = BytesIO() 

752 null_byte = b"\x00" 

753 

754 # len 

755 bio.write(struct.pack(byteorder + "i", self.len)) 

756 

757 # labname 

758 labname = str(self.labname)[:32].encode(encoding) 

759 lab_len = 32 if encoding not in ("utf-8", "utf8") else 128 

760 labname = _pad_bytes(labname, lab_len + 1) 

761 bio.write(labname) 

762 

763 # padding - 3 bytes 

764 for i in range(3): 

765 bio.write(struct.pack("c", null_byte)) 

766 

767 # value_label_table 

768 # n - int32 

769 bio.write(struct.pack(byteorder + "i", self.n)) 

770 

771 # textlen - int32 

772 bio.write(struct.pack(byteorder + "i", self.text_len)) 

773 

774 # off - int32 array (n elements) 

775 for offset in self.off: 

776 bio.write(struct.pack(byteorder + "i", offset)) 

777 

778 # val - int32 array (n elements) 

779 for value in self.val: 

780 bio.write(struct.pack(byteorder + "i", value)) 

781 

782 # txt - Text labels, null terminated 

783 for text in self.txt: 

784 bio.write(text + null_byte) 

785 

786 return bio.getvalue() 

787 

788 

789class StataNonCatValueLabel(StataValueLabel): 

790 """ 

791 Prepare formatted version of value labels 

792 

793 Parameters 

794 ---------- 

795 labname : str 

796 Value label name 

797 value_labels: Dictionary 

798 Mapping of values to labels 

799 encoding : {"latin-1", "utf-8"} 

800 Encoding to use for value labels. 

801 """ 

802 

803 def __init__( 

804 self, 

805 labname: str, 

806 value_labels: dict[float, str], 

807 encoding: Literal["latin-1", "utf-8"] = "latin-1", 

808 ) -> None: 

809 

810 if encoding not in ("latin-1", "utf-8"): 

811 raise ValueError("Only latin-1 and utf-8 are supported.") 

812 

813 self.labname = labname 

814 self._encoding = encoding 

815 self.value_labels: list[tuple[float, str]] = sorted( 

816 value_labels.items(), key=lambda x: x[0] 

817 ) 

818 self._prepare_value_labels() 

819 

820 

821class StataMissingValue: 

822 """ 

823 An observation's missing value. 

824 

825 Parameters 

826 ---------- 

827 value : {int, float} 

828 The Stata missing value code 

829 

830 Notes 

831 ----- 

832 More information: <https://www.stata.com/help.cgi?missing> 

833 

834 Integer missing values make the code '.', '.a', ..., '.z' to the ranges 

835 101 ... 127 (for int8), 32741 ... 32767 (for int16) and 2147483621 ... 

836 2147483647 (for int32). Missing values for floating point data types are 

837 more complex but the pattern is simple to discern from the following table. 

838 

839 np.float32 missing values (float in Stata) 

840 0000007f . 

841 0008007f .a 

842 0010007f .b 

843 ... 

844 00c0007f .x 

845 00c8007f .y 

846 00d0007f .z 

847 

848 np.float64 missing values (double in Stata) 

849 000000000000e07f . 

850 000000000001e07f .a 

851 000000000002e07f .b 

852 ... 

853 000000000018e07f .x 

854 000000000019e07f .y 

855 00000000001ae07f .z 

856 """ 

857 

858 # Construct a dictionary of missing values 

859 MISSING_VALUES: dict[float, str] = {} 

860 bases: Final = (101, 32741, 2147483621) 

861 for b in bases: 

862 # Conversion to long to avoid hash issues on 32 bit platforms #8968 

863 MISSING_VALUES[b] = "." 

864 for i in range(1, 27): 

865 MISSING_VALUES[i + b] = "." + chr(96 + i) 

866 

867 float32_base: bytes = b"\x00\x00\x00\x7f" 

868 increment: int = struct.unpack("<i", b"\x00\x08\x00\x00")[0] 

869 for i in range(27): 

870 key = struct.unpack("<f", float32_base)[0] 

871 MISSING_VALUES[key] = "." 

872 if i > 0: 

873 MISSING_VALUES[key] += chr(96 + i) 

874 int_value = struct.unpack("<i", struct.pack("<f", key))[0] + increment 

875 float32_base = struct.pack("<i", int_value) 

876 

877 float64_base: bytes = b"\x00\x00\x00\x00\x00\x00\xe0\x7f" 

878 increment = struct.unpack("q", b"\x00\x00\x00\x00\x00\x01\x00\x00")[0] 

879 for i in range(27): 

880 key = struct.unpack("<d", float64_base)[0] 

881 MISSING_VALUES[key] = "." 

882 if i > 0: 

883 MISSING_VALUES[key] += chr(96 + i) 

884 int_value = struct.unpack("q", struct.pack("<d", key))[0] + increment 

885 float64_base = struct.pack("q", int_value) 

886 

887 BASE_MISSING_VALUES: Final = { 

888 "int8": 101, 

889 "int16": 32741, 

890 "int32": 2147483621, 

891 "float32": struct.unpack("<f", float32_base)[0], 

892 "float64": struct.unpack("<d", float64_base)[0], 

893 } 

894 

895 def __init__(self, value: float) -> None: 

896 self._value = value 

897 # Conversion to int to avoid hash issues on 32 bit platforms #8968 

898 value = int(value) if value < 2147483648 else float(value) 

899 self._str = self.MISSING_VALUES[value] 

900 

901 @property 

902 def string(self) -> str: 

903 """ 

904 The Stata representation of the missing value: '.', '.a'..'.z' 

905 

906 Returns 

907 ------- 

908 str 

909 The representation of the missing value. 

910 """ 

911 return self._str 

912 

913 @property 

914 def value(self) -> float: 

915 """ 

916 The binary representation of the missing value. 

917 

918 Returns 

919 ------- 

920 {int, float} 

921 The binary representation of the missing value. 

922 """ 

923 return self._value 

924 

925 def __str__(self) -> str: 

926 return self.string 

927 

928 def __repr__(self) -> str: 

929 return f"{type(self)}({self})" 

930 

931 def __eq__(self, other: Any) -> bool: 

932 return ( 

933 isinstance(other, type(self)) 

934 and self.string == other.string 

935 and self.value == other.value 

936 ) 

937 

938 @classmethod 

939 def get_base_missing_value(cls, dtype: np.dtype) -> float: 

940 if dtype.type is np.int8: 

941 value = cls.BASE_MISSING_VALUES["int8"] 

942 elif dtype.type is np.int16: 

943 value = cls.BASE_MISSING_VALUES["int16"] 

944 elif dtype.type is np.int32: 

945 value = cls.BASE_MISSING_VALUES["int32"] 

946 elif dtype.type is np.float32: 

947 value = cls.BASE_MISSING_VALUES["float32"] 

948 elif dtype.type is np.float64: 

949 value = cls.BASE_MISSING_VALUES["float64"] 

950 else: 

951 raise ValueError("Unsupported dtype") 

952 return value 

953 

954 

955class StataParser: 

956 def __init__(self) -> None: 

957 

958 # type code. 

959 # -------------------- 

960 # str1 1 = 0x01 

961 # str2 2 = 0x02 

962 # ... 

963 # str244 244 = 0xf4 

964 # byte 251 = 0xfb (sic) 

965 # int 252 = 0xfc 

966 # long 253 = 0xfd 

967 # float 254 = 0xfe 

968 # double 255 = 0xff 

969 # -------------------- 

970 # NOTE: the byte type seems to be reserved for categorical variables 

971 # with a label, but the underlying variable is -127 to 100 

972 # we're going to drop the label and cast to int 

973 self.DTYPE_MAP = dict( 

974 list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)])) 

975 + [ 

976 (251, np.dtype(np.int8)), 

977 (252, np.dtype(np.int16)), 

978 (253, np.dtype(np.int32)), 

979 (254, np.dtype(np.float32)), 

980 (255, np.dtype(np.float64)), 

981 ] 

982 ) 

983 self.DTYPE_MAP_XML: dict[int, np.dtype] = { 

984 32768: np.dtype(np.uint8), # Keys to GSO 

985 65526: np.dtype(np.float64), 

986 65527: np.dtype(np.float32), 

987 65528: np.dtype(np.int32), 

988 65529: np.dtype(np.int16), 

989 65530: np.dtype(np.int8), 

990 } 

991 self.TYPE_MAP = list(tuple(range(251)) + tuple("bhlfd")) 

992 self.TYPE_MAP_XML = { 

993 # Not really a Q, unclear how to handle byteswap 

994 32768: "Q", 

995 65526: "d", 

996 65527: "f", 

997 65528: "l", 

998 65529: "h", 

999 65530: "b", 

1000 } 

1001 # NOTE: technically, some of these are wrong. there are more numbers 

1002 # that can be represented. it's the 27 ABOVE and BELOW the max listed 

1003 # numeric data type in [U] 12.2.2 of the 11.2 manual 

1004 float32_min = b"\xff\xff\xff\xfe" 

1005 float32_max = b"\xff\xff\xff\x7e" 

1006 float64_min = b"\xff\xff\xff\xff\xff\xff\xef\xff" 

1007 float64_max = b"\xff\xff\xff\xff\xff\xff\xdf\x7f" 

1008 self.VALID_RANGE = { 

1009 "b": (-127, 100), 

1010 "h": (-32767, 32740), 

1011 "l": (-2147483647, 2147483620), 

1012 "f": ( 

1013 np.float32(struct.unpack("<f", float32_min)[0]), 

1014 np.float32(struct.unpack("<f", float32_max)[0]), 

1015 ), 

1016 "d": ( 

1017 np.float64(struct.unpack("<d", float64_min)[0]), 

1018 np.float64(struct.unpack("<d", float64_max)[0]), 

1019 ), 

1020 } 

1021 

1022 self.OLD_TYPE_MAPPING = { 

1023 98: 251, # byte 

1024 105: 252, # int 

1025 108: 253, # long 

1026 102: 254, # float 

1027 100: 255, # double 

1028 } 

1029 

1030 # These missing values are the generic '.' in Stata, and are used 

1031 # to replace nans 

1032 self.MISSING_VALUES = { 

1033 "b": 101, 

1034 "h": 32741, 

1035 "l": 2147483621, 

1036 "f": np.float32(struct.unpack("<f", b"\x00\x00\x00\x7f")[0]), 

1037 "d": np.float64( 

1038 struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\xe0\x7f")[0] 

1039 ), 

1040 } 

1041 self.NUMPY_TYPE_MAP = { 

1042 "b": "i1", 

1043 "h": "i2", 

1044 "l": "i4", 

1045 "f": "f4", 

1046 "d": "f8", 

1047 "Q": "u8", 

1048 } 

1049 

1050 # Reserved words cannot be used as variable names 

1051 self.RESERVED_WORDS = ( 

1052 "aggregate", 

1053 "array", 

1054 "boolean", 

1055 "break", 

1056 "byte", 

1057 "case", 

1058 "catch", 

1059 "class", 

1060 "colvector", 

1061 "complex", 

1062 "const", 

1063 "continue", 

1064 "default", 

1065 "delegate", 

1066 "delete", 

1067 "do", 

1068 "double", 

1069 "else", 

1070 "eltypedef", 

1071 "end", 

1072 "enum", 

1073 "explicit", 

1074 "export", 

1075 "external", 

1076 "float", 

1077 "for", 

1078 "friend", 

1079 "function", 

1080 "global", 

1081 "goto", 

1082 "if", 

1083 "inline", 

1084 "int", 

1085 "local", 

1086 "long", 

1087 "NULL", 

1088 "pragma", 

1089 "protected", 

1090 "quad", 

1091 "rowvector", 

1092 "short", 

1093 "typedef", 

1094 "typename", 

1095 "virtual", 

1096 "_all", 

1097 "_N", 

1098 "_skip", 

1099 "_b", 

1100 "_pi", 

1101 "str#", 

1102 "in", 

1103 "_pred", 

1104 "strL", 

1105 "_coef", 

1106 "_rc", 

1107 "using", 

1108 "_cons", 

1109 "_se", 

1110 "with", 

1111 "_n", 

1112 ) 

1113 

1114 

1115class StataReader(StataParser, abc.Iterator): 

1116 __doc__ = _stata_reader_doc 

1117 

1118 def __init__( 

1119 self, 

1120 path_or_buf: FilePath | ReadBuffer[bytes], 

1121 convert_dates: bool = True, 

1122 convert_categoricals: bool = True, 

1123 index_col: str | None = None, 

1124 convert_missing: bool = False, 

1125 preserve_dtypes: bool = True, 

1126 columns: Sequence[str] | None = None, 

1127 order_categoricals: bool = True, 

1128 chunksize: int | None = None, 

1129 compression: CompressionOptions = "infer", 

1130 storage_options: StorageOptions = None, 

1131 ) -> None: 

1132 super().__init__() 

1133 self.col_sizes: list[int] = [] 

1134 

1135 # Arguments to the reader (can be temporarily overridden in 

1136 # calls to read). 

1137 self._convert_dates = convert_dates 

1138 self._convert_categoricals = convert_categoricals 

1139 self._index_col = index_col 

1140 self._convert_missing = convert_missing 

1141 self._preserve_dtypes = preserve_dtypes 

1142 self._columns = columns 

1143 self._order_categoricals = order_categoricals 

1144 self._encoding = "" 

1145 self._chunksize = chunksize 

1146 self._using_iterator = False 

1147 if self._chunksize is None: 

1148 self._chunksize = 1 

1149 elif not isinstance(chunksize, int) or chunksize <= 0: 

1150 raise ValueError("chunksize must be a positive integer when set.") 

1151 

1152 # State variables for the file 

1153 self._has_string_data = False 

1154 self._missing_values = False 

1155 self._can_read_value_labels = False 

1156 self._column_selector_set = False 

1157 self._value_labels_read = False 

1158 self._data_read = False 

1159 self._dtype: np.dtype | None = None 

1160 self._lines_read = 0 

1161 

1162 self._native_byteorder = _set_endianness(sys.byteorder) 

1163 with get_handle( 

1164 path_or_buf, 

1165 "rb", 

1166 storage_options=storage_options, 

1167 is_text=False, 

1168 compression=compression, 

1169 ) as handles: 

1170 # Copy to BytesIO, and ensure no encoding 

1171 self.path_or_buf = BytesIO(handles.handle.read()) 

1172 

1173 self._read_header() 

1174 self._setup_dtype() 

1175 

1176 def __enter__(self) -> StataReader: 

1177 """enter context manager""" 

1178 return self 

1179 

1180 def __exit__(self, exc_type, exc_value, traceback) -> None: 

1181 """exit context manager""" 

1182 self.close() 

1183 

1184 def close(self) -> None: 

1185 """close the handle if its open""" 

1186 self.path_or_buf.close() 

1187 

1188 def _set_encoding(self) -> None: 

1189 """ 

1190 Set string encoding which depends on file version 

1191 """ 

1192 if self.format_version < 118: 

1193 self._encoding = "latin-1" 

1194 else: 

1195 self._encoding = "utf-8" 

1196 

1197 def _read_header(self) -> None: 

1198 first_char = self.path_or_buf.read(1) 

1199 if struct.unpack("c", first_char)[0] == b"<": 

1200 self._read_new_header() 

1201 else: 

1202 self._read_old_header(first_char) 

1203 

1204 self.has_string_data = len([x for x in self.typlist if type(x) is int]) > 0 

1205 

1206 # calculate size of a data record 

1207 self.col_sizes = [self._calcsize(typ) for typ in self.typlist] 

1208 

1209 def _read_new_header(self) -> None: 

1210 # The first part of the header is common to 117 - 119. 

1211 self.path_or_buf.read(27) # stata_dta><header><release> 

1212 self.format_version = int(self.path_or_buf.read(3)) 

1213 if self.format_version not in [117, 118, 119]: 

1214 raise ValueError(_version_error.format(version=self.format_version)) 

1215 self._set_encoding() 

1216 self.path_or_buf.read(21) # </release><byteorder> 

1217 self.byteorder = self.path_or_buf.read(3) == b"MSF" and ">" or "<" 

1218 self.path_or_buf.read(15) # </byteorder><K> 

1219 nvar_type = "H" if self.format_version <= 118 else "I" 

1220 nvar_size = 2 if self.format_version <= 118 else 4 

1221 self.nvar = struct.unpack( 

1222 self.byteorder + nvar_type, self.path_or_buf.read(nvar_size) 

1223 )[0] 

1224 self.path_or_buf.read(7) # </K><N> 

1225 

1226 self.nobs = self._get_nobs() 

1227 self.path_or_buf.read(11) # </N><label> 

1228 self._data_label = self._get_data_label() 

1229 self.path_or_buf.read(19) # </label><timestamp> 

1230 self.time_stamp = self._get_time_stamp() 

1231 self.path_or_buf.read(26) # </timestamp></header><map> 

1232 self.path_or_buf.read(8) # 0x0000000000000000 

1233 self.path_or_buf.read(8) # position of <map> 

1234 

1235 self._seek_vartypes = ( 

1236 struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 16 

1237 ) 

1238 self._seek_varnames = ( 

1239 struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 10 

1240 ) 

1241 self._seek_sortlist = ( 

1242 struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 10 

1243 ) 

1244 self._seek_formats = ( 

1245 struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 9 

1246 ) 

1247 self._seek_value_label_names = ( 

1248 struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 19 

1249 ) 

1250 

1251 # Requires version-specific treatment 

1252 self._seek_variable_labels = self._get_seek_variable_labels() 

1253 

1254 self.path_or_buf.read(8) # <characteristics> 

1255 self.data_location = ( 

1256 struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 6 

1257 ) 

1258 self.seek_strls = ( 

1259 struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 7 

1260 ) 

1261 self.seek_value_labels = ( 

1262 struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 14 

1263 ) 

1264 

1265 self.typlist, self.dtyplist = self._get_dtypes(self._seek_vartypes) 

1266 

1267 self.path_or_buf.seek(self._seek_varnames) 

1268 self.varlist = self._get_varlist() 

1269 

1270 self.path_or_buf.seek(self._seek_sortlist) 

1271 self.srtlist = struct.unpack( 

1272 self.byteorder + ("h" * (self.nvar + 1)), 

1273 self.path_or_buf.read(2 * (self.nvar + 1)), 

1274 )[:-1] 

1275 

1276 self.path_or_buf.seek(self._seek_formats) 

1277 self.fmtlist = self._get_fmtlist() 

1278 

1279 self.path_or_buf.seek(self._seek_value_label_names) 

1280 self.lbllist = self._get_lbllist() 

1281 

1282 self.path_or_buf.seek(self._seek_variable_labels) 

1283 self._variable_labels = self._get_variable_labels() 

1284 

1285 # Get data type information, works for versions 117-119. 

1286 def _get_dtypes( 

1287 self, seek_vartypes: int 

1288 ) -> tuple[list[int | str], list[str | np.dtype]]: 

1289 

1290 self.path_or_buf.seek(seek_vartypes) 

1291 raw_typlist = [ 

1292 struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0] 

1293 for _ in range(self.nvar) 

1294 ] 

1295 

1296 def f(typ: int) -> int | str: 

1297 if typ <= 2045: 

1298 return typ 

1299 try: 

1300 return self.TYPE_MAP_XML[typ] 

1301 except KeyError as err: 

1302 raise ValueError(f"cannot convert stata types [{typ}]") from err 

1303 

1304 typlist = [f(x) for x in raw_typlist] 

1305 

1306 def g(typ: int) -> str | np.dtype: 

1307 if typ <= 2045: 

1308 return str(typ) 

1309 try: 

1310 return self.DTYPE_MAP_XML[typ] 

1311 except KeyError as err: 

1312 raise ValueError(f"cannot convert stata dtype [{typ}]") from err 

1313 

1314 dtyplist = [g(x) for x in raw_typlist] 

1315 

1316 return typlist, dtyplist 

1317 

1318 def _get_varlist(self) -> list[str]: 

1319 # 33 in order formats, 129 in formats 118 and 119 

1320 b = 33 if self.format_version < 118 else 129 

1321 return [self._decode(self.path_or_buf.read(b)) for _ in range(self.nvar)] 

1322 

1323 # Returns the format list 

1324 def _get_fmtlist(self) -> list[str]: 

1325 if self.format_version >= 118: 

1326 b = 57 

1327 elif self.format_version > 113: 

1328 b = 49 

1329 elif self.format_version > 104: 

1330 b = 12 

1331 else: 

1332 b = 7 

1333 

1334 return [self._decode(self.path_or_buf.read(b)) for _ in range(self.nvar)] 

1335 

1336 # Returns the label list 

1337 def _get_lbllist(self) -> list[str]: 

1338 if self.format_version >= 118: 

1339 b = 129 

1340 elif self.format_version > 108: 

1341 b = 33 

1342 else: 

1343 b = 9 

1344 return [self._decode(self.path_or_buf.read(b)) for _ in range(self.nvar)] 

1345 

1346 def _get_variable_labels(self) -> list[str]: 

1347 if self.format_version >= 118: 

1348 vlblist = [ 

1349 self._decode(self.path_or_buf.read(321)) for _ in range(self.nvar) 

1350 ] 

1351 elif self.format_version > 105: 

1352 vlblist = [ 

1353 self._decode(self.path_or_buf.read(81)) for _ in range(self.nvar) 

1354 ] 

1355 else: 

1356 vlblist = [ 

1357 self._decode(self.path_or_buf.read(32)) for _ in range(self.nvar) 

1358 ] 

1359 return vlblist 

1360 

1361 def _get_nobs(self) -> int: 

1362 if self.format_version >= 118: 

1363 return struct.unpack(self.byteorder + "Q", self.path_or_buf.read(8))[0] 

1364 else: 

1365 return struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0] 

1366 

1367 def _get_data_label(self) -> str: 

1368 if self.format_version >= 118: 

1369 strlen = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0] 

1370 return self._decode(self.path_or_buf.read(strlen)) 

1371 elif self.format_version == 117: 

1372 strlen = struct.unpack("b", self.path_or_buf.read(1))[0] 

1373 return self._decode(self.path_or_buf.read(strlen)) 

1374 elif self.format_version > 105: 

1375 return self._decode(self.path_or_buf.read(81)) 

1376 else: 

1377 return self._decode(self.path_or_buf.read(32)) 

1378 

1379 def _get_time_stamp(self) -> str: 

1380 if self.format_version >= 118: 

1381 strlen = struct.unpack("b", self.path_or_buf.read(1))[0] 

1382 return self.path_or_buf.read(strlen).decode("utf-8") 

1383 elif self.format_version == 117: 

1384 strlen = struct.unpack("b", self.path_or_buf.read(1))[0] 

1385 return self._decode(self.path_or_buf.read(strlen)) 

1386 elif self.format_version > 104: 

1387 return self._decode(self.path_or_buf.read(18)) 

1388 else: 

1389 raise ValueError() 

1390 

1391 def _get_seek_variable_labels(self) -> int: 

1392 if self.format_version == 117: 

1393 self.path_or_buf.read(8) # <variable_labels>, throw away 

1394 # Stata 117 data files do not follow the described format. This is 

1395 # a work around that uses the previous label, 33 bytes for each 

1396 # variable, 20 for the closing tag and 17 for the opening tag 

1397 return self._seek_value_label_names + (33 * self.nvar) + 20 + 17 

1398 elif self.format_version >= 118: 

1399 return struct.unpack(self.byteorder + "q", self.path_or_buf.read(8))[0] + 17 

1400 else: 

1401 raise ValueError() 

1402 

1403 def _read_old_header(self, first_char: bytes) -> None: 

1404 self.format_version = struct.unpack("b", first_char)[0] 

1405 if self.format_version not in [104, 105, 108, 111, 113, 114, 115]: 

1406 raise ValueError(_version_error.format(version=self.format_version)) 

1407 self._set_encoding() 

1408 self.byteorder = ( 

1409 struct.unpack("b", self.path_or_buf.read(1))[0] == 0x1 and ">" or "<" 

1410 ) 

1411 self.filetype = struct.unpack("b", self.path_or_buf.read(1))[0] 

1412 self.path_or_buf.read(1) # unused 

1413 

1414 self.nvar = struct.unpack(self.byteorder + "H", self.path_or_buf.read(2))[0] 

1415 self.nobs = self._get_nobs() 

1416 

1417 self._data_label = self._get_data_label() 

1418 

1419 self.time_stamp = self._get_time_stamp() 

1420 

1421 # descriptors 

1422 if self.format_version > 108: 

1423 typlist = [ord(self.path_or_buf.read(1)) for _ in range(self.nvar)] 

1424 else: 

1425 buf = self.path_or_buf.read(self.nvar) 

1426 typlistb = np.frombuffer(buf, dtype=np.uint8) 

1427 typlist = [] 

1428 for tp in typlistb: 

1429 if tp in self.OLD_TYPE_MAPPING: 

1430 typlist.append(self.OLD_TYPE_MAPPING[tp]) 

1431 else: 

1432 typlist.append(tp - 127) # bytes 

1433 

1434 try: 

1435 self.typlist = [self.TYPE_MAP[typ] for typ in typlist] 

1436 except ValueError as err: 

1437 invalid_types = ",".join([str(x) for x in typlist]) 

1438 raise ValueError(f"cannot convert stata types [{invalid_types}]") from err 

1439 try: 

1440 self.dtyplist = [self.DTYPE_MAP[typ] for typ in typlist] 

1441 except ValueError as err: 

1442 invalid_dtypes = ",".join([str(x) for x in typlist]) 

1443 raise ValueError(f"cannot convert stata dtypes [{invalid_dtypes}]") from err 

1444 

1445 if self.format_version > 108: 

1446 self.varlist = [ 

1447 self._decode(self.path_or_buf.read(33)) for _ in range(self.nvar) 

1448 ] 

1449 else: 

1450 self.varlist = [ 

1451 self._decode(self.path_or_buf.read(9)) for _ in range(self.nvar) 

1452 ] 

1453 self.srtlist = struct.unpack( 

1454 self.byteorder + ("h" * (self.nvar + 1)), 

1455 self.path_or_buf.read(2 * (self.nvar + 1)), 

1456 )[:-1] 

1457 

1458 self.fmtlist = self._get_fmtlist() 

1459 

1460 self.lbllist = self._get_lbllist() 

1461 

1462 self._variable_labels = self._get_variable_labels() 

1463 

1464 # ignore expansion fields (Format 105 and later) 

1465 # When reading, read five bytes; the last four bytes now tell you 

1466 # the size of the next read, which you discard. You then continue 

1467 # like this until you read 5 bytes of zeros. 

1468 

1469 if self.format_version > 104: 

1470 while True: 

1471 data_type = struct.unpack( 

1472 self.byteorder + "b", self.path_or_buf.read(1) 

1473 )[0] 

1474 if self.format_version > 108: 

1475 data_len = struct.unpack( 

1476 self.byteorder + "i", self.path_or_buf.read(4) 

1477 )[0] 

1478 else: 

1479 data_len = struct.unpack( 

1480 self.byteorder + "h", self.path_or_buf.read(2) 

1481 )[0] 

1482 if data_type == 0: 

1483 break 

1484 self.path_or_buf.read(data_len) 

1485 

1486 # necessary data to continue parsing 

1487 self.data_location = self.path_or_buf.tell() 

1488 

1489 def _setup_dtype(self) -> np.dtype: 

1490 """Map between numpy and state dtypes""" 

1491 if self._dtype is not None: 

1492 return self._dtype 

1493 

1494 dtypes = [] # Convert struct data types to numpy data type 

1495 for i, typ in enumerate(self.typlist): 

1496 if typ in self.NUMPY_TYPE_MAP: 

1497 typ = cast(str, typ) # only strs in NUMPY_TYPE_MAP 

1498 dtypes.append(("s" + str(i), self.byteorder + self.NUMPY_TYPE_MAP[typ])) 

1499 else: 

1500 dtypes.append(("s" + str(i), "S" + str(typ))) 

1501 self._dtype = np.dtype(dtypes) 

1502 

1503 return self._dtype 

1504 

1505 def _calcsize(self, fmt: int | str) -> int: 

1506 if isinstance(fmt, int): 

1507 return fmt 

1508 return struct.calcsize(self.byteorder + fmt) 

1509 

1510 def _decode(self, s: bytes) -> str: 

1511 # have bytes not strings, so must decode 

1512 s = s.partition(b"\0")[0] 

1513 try: 

1514 return s.decode(self._encoding) 

1515 except UnicodeDecodeError: 

1516 # GH 25960, fallback to handle incorrect format produced when 117 

1517 # files are converted to 118 files in Stata 

1518 encoding = self._encoding 

1519 msg = f""" 

1520One or more strings in the dta file could not be decoded using {encoding}, and 

1521so the fallback encoding of latin-1 is being used. This can happen when a file 

1522has been incorrectly encoded by Stata or some other software. You should verify 

1523the string values returned are correct.""" 

1524 warnings.warn( 

1525 msg, 

1526 UnicodeWarning, 

1527 stacklevel=find_stack_level(), 

1528 ) 

1529 return s.decode("latin-1") 

1530 

1531 def _read_value_labels(self) -> None: 

1532 if self._value_labels_read: 

1533 # Don't read twice 

1534 return 

1535 if self.format_version <= 108: 

1536 # Value labels are not supported in version 108 and earlier. 

1537 self._value_labels_read = True 

1538 self.value_label_dict: dict[str, dict[float, str]] = {} 

1539 return 

1540 

1541 if self.format_version >= 117: 

1542 self.path_or_buf.seek(self.seek_value_labels) 

1543 else: 

1544 assert self._dtype is not None 

1545 offset = self.nobs * self._dtype.itemsize 

1546 self.path_or_buf.seek(self.data_location + offset) 

1547 

1548 self._value_labels_read = True 

1549 self.value_label_dict = {} 

1550 

1551 while True: 

1552 if self.format_version >= 117: 

1553 if self.path_or_buf.read(5) == b"</val": # <lbl> 

1554 break # end of value label table 

1555 

1556 slength = self.path_or_buf.read(4) 

1557 if not slength: 

1558 break # end of value label table (format < 117) 

1559 if self.format_version <= 117: 

1560 labname = self._decode(self.path_or_buf.read(33)) 

1561 else: 

1562 labname = self._decode(self.path_or_buf.read(129)) 

1563 self.path_or_buf.read(3) # padding 

1564 

1565 n = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0] 

1566 txtlen = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0] 

1567 off = np.frombuffer( 

1568 self.path_or_buf.read(4 * n), dtype=self.byteorder + "i4", count=n 

1569 ) 

1570 val = np.frombuffer( 

1571 self.path_or_buf.read(4 * n), dtype=self.byteorder + "i4", count=n 

1572 ) 

1573 ii = np.argsort(off) 

1574 off = off[ii] 

1575 val = val[ii] 

1576 txt = self.path_or_buf.read(txtlen) 

1577 self.value_label_dict[labname] = {} 

1578 for i in range(n): 

1579 end = off[i + 1] if i < n - 1 else txtlen 

1580 self.value_label_dict[labname][val[i]] = self._decode(txt[off[i] : end]) 

1581 if self.format_version >= 117: 

1582 self.path_or_buf.read(6) # </lbl> 

1583 self._value_labels_read = True 

1584 

1585 def _read_strls(self) -> None: 

1586 self.path_or_buf.seek(self.seek_strls) 

1587 # Wrap v_o in a string to allow uint64 values as keys on 32bit OS 

1588 self.GSO = {"0": ""} 

1589 while True: 

1590 if self.path_or_buf.read(3) != b"GSO": 

1591 break 

1592 

1593 if self.format_version == 117: 

1594 v_o = struct.unpack(self.byteorder + "Q", self.path_or_buf.read(8))[0] 

1595 else: 

1596 buf = self.path_or_buf.read(12) 

1597 # Only tested on little endian file on little endian machine. 

1598 v_size = 2 if self.format_version == 118 else 3 

1599 if self.byteorder == "<": 

1600 buf = buf[0:v_size] + buf[4 : (12 - v_size)] 

1601 else: 

1602 # This path may not be correct, impossible to test 

1603 buf = buf[0:v_size] + buf[(4 + v_size) :] 

1604 v_o = struct.unpack("Q", buf)[0] 

1605 typ = struct.unpack("B", self.path_or_buf.read(1))[0] 

1606 length = struct.unpack(self.byteorder + "I", self.path_or_buf.read(4))[0] 

1607 va = self.path_or_buf.read(length) 

1608 if typ == 130: 

1609 decoded_va = va[0:-1].decode(self._encoding) 

1610 else: 

1611 # Stata says typ 129 can be binary, so use str 

1612 decoded_va = str(va) 

1613 # Wrap v_o in a string to allow uint64 values as keys on 32bit OS 

1614 self.GSO[str(v_o)] = decoded_va 

1615 

1616 def __next__(self) -> DataFrame: 

1617 self._using_iterator = True 

1618 return self.read(nrows=self._chunksize) 

1619 

1620 def get_chunk(self, size: int | None = None) -> DataFrame: 

1621 """ 

1622 Reads lines from Stata file and returns as dataframe 

1623 

1624 Parameters 

1625 ---------- 

1626 size : int, defaults to None 

1627 Number of lines to read. If None, reads whole file. 

1628 

1629 Returns 

1630 ------- 

1631 DataFrame 

1632 """ 

1633 if size is None: 

1634 size = self._chunksize 

1635 return self.read(nrows=size) 

1636 

1637 @Appender(_read_method_doc) 

1638 def read( 

1639 self, 

1640 nrows: int | None = None, 

1641 convert_dates: bool | None = None, 

1642 convert_categoricals: bool | None = None, 

1643 index_col: str | None = None, 

1644 convert_missing: bool | None = None, 

1645 preserve_dtypes: bool | None = None, 

1646 columns: Sequence[str] | None = None, 

1647 order_categoricals: bool | None = None, 

1648 ) -> DataFrame: 

1649 # Handle empty file or chunk. If reading incrementally raise 

1650 # StopIteration. If reading the whole thing return an empty 

1651 # data frame. 

1652 if (self.nobs == 0) and (nrows is None): 

1653 self._can_read_value_labels = True 

1654 self._data_read = True 

1655 self.close() 

1656 return DataFrame(columns=self.varlist) 

1657 

1658 # Handle options 

1659 if convert_dates is None: 

1660 convert_dates = self._convert_dates 

1661 if convert_categoricals is None: 

1662 convert_categoricals = self._convert_categoricals 

1663 if convert_missing is None: 

1664 convert_missing = self._convert_missing 

1665 if preserve_dtypes is None: 

1666 preserve_dtypes = self._preserve_dtypes 

1667 if columns is None: 

1668 columns = self._columns 

1669 if order_categoricals is None: 

1670 order_categoricals = self._order_categoricals 

1671 if index_col is None: 

1672 index_col = self._index_col 

1673 

1674 if nrows is None: 

1675 nrows = self.nobs 

1676 

1677 if (self.format_version >= 117) and (not self._value_labels_read): 

1678 self._can_read_value_labels = True 

1679 self._read_strls() 

1680 

1681 # Read data 

1682 assert self._dtype is not None 

1683 dtype = self._dtype 

1684 max_read_len = (self.nobs - self._lines_read) * dtype.itemsize 

1685 read_len = nrows * dtype.itemsize 

1686 read_len = min(read_len, max_read_len) 

1687 if read_len <= 0: 

1688 # Iterator has finished, should never be here unless 

1689 # we are reading the file incrementally 

1690 if convert_categoricals: 

1691 self._read_value_labels() 

1692 self.close() 

1693 raise StopIteration 

1694 offset = self._lines_read * dtype.itemsize 

1695 self.path_or_buf.seek(self.data_location + offset) 

1696 read_lines = min(nrows, self.nobs - self._lines_read) 

1697 raw_data = np.frombuffer( 

1698 self.path_or_buf.read(read_len), dtype=dtype, count=read_lines 

1699 ) 

1700 

1701 self._lines_read += read_lines 

1702 if self._lines_read == self.nobs: 

1703 self._can_read_value_labels = True 

1704 self._data_read = True 

1705 # if necessary, swap the byte order to native here 

1706 if self.byteorder != self._native_byteorder: 

1707 raw_data = raw_data.byteswap().newbyteorder() 

1708 

1709 if convert_categoricals: 

1710 self._read_value_labels() 

1711 

1712 if len(raw_data) == 0: 

1713 data = DataFrame(columns=self.varlist) 

1714 else: 

1715 data = DataFrame.from_records(raw_data) 

1716 data.columns = Index(self.varlist) 

1717 

1718 # If index is not specified, use actual row number rather than 

1719 # restarting at 0 for each chunk. 

1720 if index_col is None: 

1721 rng = np.arange(self._lines_read - read_lines, self._lines_read) 

1722 data.index = Index(rng) # set attr instead of set_index to avoid copy 

1723 

1724 if columns is not None: 

1725 try: 

1726 data = self._do_select_columns(data, columns) 

1727 except ValueError: 

1728 self.close() 

1729 raise 

1730 

1731 # Decode strings 

1732 for col, typ in zip(data, self.typlist): 

1733 if type(typ) is int: 

1734 data[col] = data[col].apply(self._decode, convert_dtype=True) 

1735 

1736 data = self._insert_strls(data) 

1737 

1738 cols_ = np.where([dtyp is not None for dtyp in self.dtyplist])[0] 

1739 # Convert columns (if needed) to match input type 

1740 ix = data.index 

1741 requires_type_conversion = False 

1742 data_formatted = [] 

1743 for i in cols_: 

1744 if self.dtyplist[i] is not None: 

1745 col = data.columns[i] 

1746 dtype = data[col].dtype 

1747 if dtype != np.dtype(object) and dtype != self.dtyplist[i]: 

1748 requires_type_conversion = True 

1749 data_formatted.append( 

1750 (col, Series(data[col], ix, self.dtyplist[i])) 

1751 ) 

1752 else: 

1753 data_formatted.append((col, data[col])) 

1754 if requires_type_conversion: 

1755 data = DataFrame.from_dict(dict(data_formatted)) 

1756 del data_formatted 

1757 

1758 data = self._do_convert_missing(data, convert_missing) 

1759 

1760 if convert_dates: 

1761 

1762 def any_startswith(x: str) -> bool: 

1763 return any(x.startswith(fmt) for fmt in _date_formats) 

1764 

1765 cols = np.where([any_startswith(x) for x in self.fmtlist])[0] 

1766 for i in cols: 

1767 col = data.columns[i] 

1768 try: 

1769 data[col] = _stata_elapsed_date_to_datetime_vec( 

1770 data[col], self.fmtlist[i] 

1771 ) 

1772 except ValueError: 

1773 self.close() 

1774 raise 

1775 

1776 if convert_categoricals and self.format_version > 108: 

1777 data = self._do_convert_categoricals( 

1778 data, self.value_label_dict, self.lbllist, order_categoricals 

1779 ) 

1780 

1781 if not preserve_dtypes: 

1782 retyped_data = [] 

1783 convert = False 

1784 for col in data: 

1785 dtype = data[col].dtype 

1786 if dtype in (np.dtype(np.float16), np.dtype(np.float32)): 

1787 dtype = np.dtype(np.float64) 

1788 convert = True 

1789 elif dtype in ( 

1790 np.dtype(np.int8), 

1791 np.dtype(np.int16), 

1792 np.dtype(np.int32), 

1793 ): 

1794 dtype = np.dtype(np.int64) 

1795 convert = True 

1796 retyped_data.append((col, data[col].astype(dtype))) 

1797 if convert: 

1798 data = DataFrame.from_dict(dict(retyped_data)) 

1799 

1800 if index_col is not None: 

1801 data = data.set_index(data.pop(index_col)) 

1802 

1803 return data 

1804 

1805 def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFrame: 

1806 # Check for missing values, and replace if found 

1807 replacements = {} 

1808 for i, colname in enumerate(data): 

1809 fmt = self.typlist[i] 

1810 if fmt not in self.VALID_RANGE: 

1811 continue 

1812 

1813 fmt = cast(str, fmt) # only strs in VALID_RANGE 

1814 nmin, nmax = self.VALID_RANGE[fmt] 

1815 series = data[colname] 

1816 

1817 # appreciably faster to do this with ndarray instead of Series 

1818 svals = series._values 

1819 missing = (svals < nmin) | (svals > nmax) 

1820 

1821 if not missing.any(): 

1822 continue 

1823 

1824 if convert_missing: # Replacement follows Stata notation 

1825 missing_loc = np.nonzero(np.asarray(missing))[0] 

1826 umissing, umissing_loc = np.unique(series[missing], return_inverse=True) 

1827 replacement = Series(series, dtype=object) 

1828 for j, um in enumerate(umissing): 

1829 missing_value = StataMissingValue(um) 

1830 

1831 loc = missing_loc[umissing_loc == j] 

1832 replacement.iloc[loc] = missing_value 

1833 else: # All replacements are identical 

1834 dtype = series.dtype 

1835 if dtype not in (np.float32, np.float64): 

1836 dtype = np.float64 

1837 replacement = Series(series, dtype=dtype) 

1838 if not replacement._values.flags["WRITEABLE"]: 

1839 # only relevant for ArrayManager; construction 

1840 # path for BlockManager ensures writeability 

1841 replacement = replacement.copy() 

1842 # Note: operating on ._values is much faster than directly 

1843 # TODO: can we fix that? 

1844 replacement._values[missing] = np.nan 

1845 replacements[colname] = replacement 

1846 

1847 if replacements: 

1848 for col in replacements: 

1849 data[col] = replacements[col] 

1850 return data 

1851 

1852 def _insert_strls(self, data: DataFrame) -> DataFrame: 

1853 if not hasattr(self, "GSO") or len(self.GSO) == 0: 

1854 return data 

1855 for i, typ in enumerate(self.typlist): 

1856 if typ != "Q": 

1857 continue 

1858 # Wrap v_o in a string to allow uint64 values as keys on 32bit OS 

1859 data.iloc[:, i] = [self.GSO[str(k)] for k in data.iloc[:, i]] 

1860 return data 

1861 

1862 def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFrame: 

1863 

1864 if not self._column_selector_set: 

1865 column_set = set(columns) 

1866 if len(column_set) != len(columns): 

1867 raise ValueError("columns contains duplicate entries") 

1868 unmatched = column_set.difference(data.columns) 

1869 if unmatched: 

1870 joined = ", ".join(list(unmatched)) 

1871 raise ValueError( 

1872 "The following columns were not " 

1873 f"found in the Stata data set: {joined}" 

1874 ) 

1875 # Copy information for retained columns for later processing 

1876 dtyplist = [] 

1877 typlist = [] 

1878 fmtlist = [] 

1879 lbllist = [] 

1880 for col in columns: 

1881 i = data.columns.get_loc(col) 

1882 dtyplist.append(self.dtyplist[i]) 

1883 typlist.append(self.typlist[i]) 

1884 fmtlist.append(self.fmtlist[i]) 

1885 lbllist.append(self.lbllist[i]) 

1886 

1887 self.dtyplist = dtyplist 

1888 self.typlist = typlist 

1889 self.fmtlist = fmtlist 

1890 self.lbllist = lbllist 

1891 self._column_selector_set = True 

1892 

1893 return data[columns] 

1894 

1895 def _do_convert_categoricals( 

1896 self, 

1897 data: DataFrame, 

1898 value_label_dict: dict[str, dict[float, str]], 

1899 lbllist: Sequence[str], 

1900 order_categoricals: bool, 

1901 ) -> DataFrame: 

1902 """ 

1903 Converts categorical columns to Categorical type. 

1904 """ 

1905 value_labels = list(value_label_dict.keys()) 

1906 cat_converted_data = [] 

1907 for col, label in zip(data, lbllist): 

1908 if label in value_labels: 

1909 # Explicit call with ordered=True 

1910 vl = value_label_dict[label] 

1911 keys = np.array(list(vl.keys())) 

1912 column = data[col] 

1913 key_matches = column.isin(keys) 

1914 if self._using_iterator and key_matches.all(): 

1915 initial_categories: np.ndarray | None = keys 

1916 # If all categories are in the keys and we are iterating, 

1917 # use the same keys for all chunks. If some are missing 

1918 # value labels, then we will fall back to the categories 

1919 # varying across chunks. 

1920 else: 

1921 if self._using_iterator: 

1922 # warn is using an iterator 

1923 warnings.warn( 

1924 categorical_conversion_warning, 

1925 CategoricalConversionWarning, 

1926 stacklevel=find_stack_level(), 

1927 ) 

1928 initial_categories = None 

1929 cat_data = Categorical( 

1930 column, categories=initial_categories, ordered=order_categoricals 

1931 ) 

1932 if initial_categories is None: 

1933 # If None here, then we need to match the cats in the Categorical 

1934 categories = [] 

1935 for category in cat_data.categories: 

1936 if category in vl: 

1937 categories.append(vl[category]) 

1938 else: 

1939 categories.append(category) 

1940 else: 

1941 # If all cats are matched, we can use the values 

1942 categories = list(vl.values()) 

1943 try: 

1944 # Try to catch duplicate categories 

1945 # TODO: if we get a non-copying rename_categories, use that 

1946 cat_data = cat_data.rename_categories(categories) 

1947 except ValueError as err: 

1948 vc = Series(categories).value_counts() 

1949 repeated_cats = list(vc.index[vc > 1]) 

1950 repeats = "-" * 80 + "\n" + "\n".join(repeated_cats) 

1951 # GH 25772 

1952 msg = f""" 

1953Value labels for column {col} are not unique. These cannot be converted to 

1954pandas categoricals. 

1955 

1956Either read the file with `convert_categoricals` set to False or use the 

1957low level interface in `StataReader` to separately read the values and the 

1958value_labels. 

1959 

1960The repeated labels are: 

1961{repeats} 

1962""" 

1963 raise ValueError(msg) from err 

1964 # TODO: is the next line needed above in the data(...) method? 

1965 cat_series = Series(cat_data, index=data.index) 

1966 cat_converted_data.append((col, cat_series)) 

1967 else: 

1968 cat_converted_data.append((col, data[col])) 

1969 data = DataFrame(dict(cat_converted_data), copy=False) 

1970 return data 

1971 

1972 @property 

1973 def data_label(self) -> str: 

1974 """ 

1975 Return data label of Stata file. 

1976 """ 

1977 return self._data_label 

1978 

1979 def variable_labels(self) -> dict[str, str]: 

1980 """ 

1981 Return a dict associating each variable name with corresponding label. 

1982 

1983 Returns 

1984 ------- 

1985 dict 

1986 """ 

1987 return dict(zip(self.varlist, self._variable_labels)) 

1988 

1989 def value_labels(self) -> dict[str, dict[float, str]]: 

1990 """ 

1991 Return a nested dict associating each variable name to its value and label. 

1992 

1993 Returns 

1994 ------- 

1995 dict 

1996 """ 

1997 if not self._value_labels_read: 

1998 self._read_value_labels() 

1999 

2000 return self.value_label_dict 

2001 

2002 

2003@Appender(_read_stata_doc) 

2004@deprecate_nonkeyword_arguments(version=None, allowed_args=["filepath_or_buffer"]) 

2005def read_stata( 

2006 filepath_or_buffer: FilePath | ReadBuffer[bytes], 

2007 convert_dates: bool = True, 

2008 convert_categoricals: bool = True, 

2009 index_col: str | None = None, 

2010 convert_missing: bool = False, 

2011 preserve_dtypes: bool = True, 

2012 columns: Sequence[str] | None = None, 

2013 order_categoricals: bool = True, 

2014 chunksize: int | None = None, 

2015 iterator: bool = False, 

2016 compression: CompressionOptions = "infer", 

2017 storage_options: StorageOptions = None, 

2018) -> DataFrame | StataReader: 

2019 

2020 reader = StataReader( 

2021 filepath_or_buffer, 

2022 convert_dates=convert_dates, 

2023 convert_categoricals=convert_categoricals, 

2024 index_col=index_col, 

2025 convert_missing=convert_missing, 

2026 preserve_dtypes=preserve_dtypes, 

2027 columns=columns, 

2028 order_categoricals=order_categoricals, 

2029 chunksize=chunksize, 

2030 storage_options=storage_options, 

2031 compression=compression, 

2032 ) 

2033 

2034 if iterator or chunksize: 

2035 return reader 

2036 

2037 with reader: 

2038 return reader.read() 

2039 

2040 

2041def _set_endianness(endianness: str) -> str: 

2042 if endianness.lower() in ["<", "little"]: 

2043 return "<" 

2044 elif endianness.lower() in [">", "big"]: 

2045 return ">" 

2046 else: # pragma : no cover 

2047 raise ValueError(f"Endianness {endianness} not understood") 

2048 

2049 

2050def _pad_bytes(name: AnyStr, length: int) -> AnyStr: 

2051 """ 

2052 Take a char string and pads it with null bytes until it's length chars. 

2053 """ 

2054 if isinstance(name, bytes): 

2055 return name + b"\x00" * (length - len(name)) 

2056 return name + "\x00" * (length - len(name)) 

2057 

2058 

2059def _convert_datetime_to_stata_type(fmt: str) -> np.dtype: 

2060 """ 

2061 Convert from one of the stata date formats to a type in TYPE_MAP. 

2062 """ 

2063 if fmt in [ 

2064 "tc", 

2065 "%tc", 

2066 "td", 

2067 "%td", 

2068 "tw", 

2069 "%tw", 

2070 "tm", 

2071 "%tm", 

2072 "tq", 

2073 "%tq", 

2074 "th", 

2075 "%th", 

2076 "ty", 

2077 "%ty", 

2078 ]: 

2079 return np.dtype(np.float64) # Stata expects doubles for SIFs 

2080 else: 

2081 raise NotImplementedError(f"Format {fmt} not implemented") 

2082 

2083 

2084def _maybe_convert_to_int_keys(convert_dates: dict, varlist: list[Hashable]) -> dict: 

2085 new_dict = {} 

2086 for key in convert_dates: 

2087 if not convert_dates[key].startswith("%"): # make sure proper fmts 

2088 convert_dates[key] = "%" + convert_dates[key] 

2089 if key in varlist: 

2090 new_dict.update({varlist.index(key): convert_dates[key]}) 

2091 else: 

2092 if not isinstance(key, int): 

2093 raise ValueError("convert_dates key must be a column or an integer") 

2094 new_dict.update({key: convert_dates[key]}) 

2095 return new_dict 

2096 

2097 

2098def _dtype_to_stata_type(dtype: np.dtype, column: Series) -> int: 

2099 """ 

2100 Convert dtype types to stata types. Returns the byte of the given ordinal. 

2101 See TYPE_MAP and comments for an explanation. This is also explained in 

2102 the dta spec. 

2103 1 - 244 are strings of this length 

2104 Pandas Stata 

2105 251 - for int8 byte 

2106 252 - for int16 int 

2107 253 - for int32 long 

2108 254 - for float32 float 

2109 255 - for double double 

2110 

2111 If there are dates to convert, then dtype will already have the correct 

2112 type inserted. 

2113 """ 

2114 # TODO: expand to handle datetime to integer conversion 

2115 if dtype.type is np.object_: # try to coerce it to the biggest string 

2116 # not memory efficient, what else could we 

2117 # do? 

2118 itemsize = max_len_string_array(ensure_object(column._values)) 

2119 return max(itemsize, 1) 

2120 elif dtype.type is np.float64: 

2121 return 255 

2122 elif dtype.type is np.float32: 

2123 return 254 

2124 elif dtype.type is np.int32: 

2125 return 253 

2126 elif dtype.type is np.int16: 

2127 return 252 

2128 elif dtype.type is np.int8: 

2129 return 251 

2130 else: # pragma : no cover 

2131 raise NotImplementedError(f"Data type {dtype} not supported.") 

2132 

2133 

2134def _dtype_to_default_stata_fmt( 

2135 dtype, column: Series, dta_version: int = 114, force_strl: bool = False 

2136) -> str: 

2137 """ 

2138 Map numpy dtype to stata's default format for this type. Not terribly 

2139 important since users can change this in Stata. Semantics are 

2140 

2141 object -> "%DDs" where DD is the length of the string. If not a string, 

2142 raise ValueError 

2143 float64 -> "%10.0g" 

2144 float32 -> "%9.0g" 

2145 int64 -> "%9.0g" 

2146 int32 -> "%12.0g" 

2147 int16 -> "%8.0g" 

2148 int8 -> "%8.0g" 

2149 strl -> "%9s" 

2150 """ 

2151 # TODO: Refactor to combine type with format 

2152 # TODO: expand this to handle a default datetime format? 

2153 if dta_version < 117: 

2154 max_str_len = 244 

2155 else: 

2156 max_str_len = 2045 

2157 if force_strl: 

2158 return "%9s" 

2159 if dtype.type is np.object_: 

2160 itemsize = max_len_string_array(ensure_object(column._values)) 

2161 if itemsize > max_str_len: 

2162 if dta_version >= 117: 

2163 return "%9s" 

2164 else: 

2165 raise ValueError(excessive_string_length_error.format(column.name)) 

2166 return "%" + str(max(itemsize, 1)) + "s" 

2167 elif dtype == np.float64: 

2168 return "%10.0g" 

2169 elif dtype == np.float32: 

2170 return "%9.0g" 

2171 elif dtype == np.int32: 

2172 return "%12.0g" 

2173 elif dtype == np.int8 or dtype == np.int16: 

2174 return "%8.0g" 

2175 else: # pragma : no cover 

2176 raise NotImplementedError(f"Data type {dtype} not supported.") 

2177 

2178 

2179@doc( 

2180 storage_options=_shared_docs["storage_options"], 

2181 compression_options=_shared_docs["compression_options"] % "fname", 

2182) 

2183class StataWriter(StataParser): 

2184 """ 

2185 A class for writing Stata binary dta files 

2186 

2187 Parameters 

2188 ---------- 

2189 fname : path (string), buffer or path object 

2190 string, path object (pathlib.Path or py._path.local.LocalPath) or 

2191 object implementing a binary write() functions. If using a buffer 

2192 then the buffer will not be automatically closed after the file 

2193 is written. 

2194 data : DataFrame 

2195 Input to save 

2196 convert_dates : dict 

2197 Dictionary mapping columns containing datetime types to stata internal 

2198 format to use when writing the dates. Options are 'tc', 'td', 'tm', 

2199 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. 

2200 Datetime columns that do not have a conversion type specified will be 

2201 converted to 'tc'. Raises NotImplementedError if a datetime column has 

2202 timezone information 

2203 write_index : bool 

2204 Write the index to Stata dataset. 

2205 byteorder : str 

2206 Can be ">", "<", "little", or "big". default is `sys.byteorder` 

2207 time_stamp : datetime 

2208 A datetime to use as file creation date. Default is the current time 

2209 data_label : str 

2210 A label for the data set. Must be 80 characters or smaller. 

2211 variable_labels : dict 

2212 Dictionary containing columns as keys and variable labels as values. 

2213 Each label must be 80 characters or smaller. 

2214 {compression_options} 

2215 

2216 .. versionadded:: 1.1.0 

2217 

2218 .. versionchanged:: 1.4.0 Zstandard support. 

2219 

2220 {storage_options} 

2221 

2222 .. versionadded:: 1.2.0 

2223 

2224 value_labels : dict of dicts 

2225 Dictionary containing columns as keys and dictionaries of column value 

2226 to labels as values. The combined length of all labels for a single 

2227 variable must be 32,000 characters or smaller. 

2228 

2229 .. versionadded:: 1.4.0 

2230 

2231 Returns 

2232 ------- 

2233 writer : StataWriter instance 

2234 The StataWriter instance has a write_file method, which will 

2235 write the file to the given `fname`. 

2236 

2237 Raises 

2238 ------ 

2239 NotImplementedError 

2240 * If datetimes contain timezone information 

2241 ValueError 

2242 * Columns listed in convert_dates are neither datetime64[ns] 

2243 or datetime.datetime 

2244 * Column dtype is not representable in Stata 

2245 * Column listed in convert_dates is not in DataFrame 

2246 * Categorical label contains more than 32,000 characters 

2247 

2248 Examples 

2249 -------- 

2250 >>> data = pd.DataFrame([[1.0, 1]], columns=['a', 'b']) 

2251 >>> writer = StataWriter('./data_file.dta', data) 

2252 >>> writer.write_file() 

2253 

2254 Directly write a zip file 

2255 >>> compression = {{"method": "zip", "archive_name": "data_file.dta"}} 

2256 >>> writer = StataWriter('./data_file.zip', data, compression=compression) 

2257 >>> writer.write_file() 

2258 

2259 Save a DataFrame with dates 

2260 >>> from datetime import datetime 

2261 >>> data = pd.DataFrame([[datetime(2000,1,1)]], columns=['date']) 

2262 >>> writer = StataWriter('./date_data_file.dta', data, {{'date' : 'tw'}}) 

2263 >>> writer.write_file() 

2264 """ 

2265 

2266 _max_string_length = 244 

2267 _encoding: Literal["latin-1", "utf-8"] = "latin-1" 

2268 

2269 def __init__( 

2270 self, 

2271 fname: FilePath | WriteBuffer[bytes], 

2272 data: DataFrame, 

2273 convert_dates: dict[Hashable, str] | None = None, 

2274 write_index: bool = True, 

2275 byteorder: str | None = None, 

2276 time_stamp: datetime.datetime | None = None, 

2277 data_label: str | None = None, 

2278 variable_labels: dict[Hashable, str] | None = None, 

2279 compression: CompressionOptions = "infer", 

2280 storage_options: StorageOptions = None, 

2281 *, 

2282 value_labels: dict[Hashable, dict[float, str]] | None = None, 

2283 ) -> None: 

2284 super().__init__() 

2285 self.data = data 

2286 self._convert_dates = {} if convert_dates is None else convert_dates 

2287 self._write_index = write_index 

2288 self._time_stamp = time_stamp 

2289 self._data_label = data_label 

2290 self._variable_labels = variable_labels 

2291 self._non_cat_value_labels = value_labels 

2292 self._value_labels: list[StataValueLabel] = [] 

2293 self._has_value_labels = np.array([], dtype=bool) 

2294 self._compression = compression 

2295 self._output_file: IO[bytes] | None = None 

2296 self._converted_names: dict[Hashable, str] = {} 

2297 # attach nobs, nvars, data, varlist, typlist 

2298 self._prepare_pandas(data) 

2299 self.storage_options = storage_options 

2300 

2301 if byteorder is None: 

2302 byteorder = sys.byteorder 

2303 self._byteorder = _set_endianness(byteorder) 

2304 self._fname = fname 

2305 self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} 

2306 

2307 def _write(self, to_write: str) -> None: 

2308 """ 

2309 Helper to call encode before writing to file for Python 3 compat. 

2310 """ 

2311 self.handles.handle.write(to_write.encode(self._encoding)) 

2312 

2313 def _write_bytes(self, value: bytes) -> None: 

2314 """ 

2315 Helper to assert file is open before writing. 

2316 """ 

2317 self.handles.handle.write(value) 

2318 

2319 def _prepare_non_cat_value_labels( 

2320 self, data: DataFrame 

2321 ) -> list[StataNonCatValueLabel]: 

2322 """ 

2323 Check for value labels provided for non-categorical columns. Value 

2324 labels 

2325 """ 

2326 non_cat_value_labels: list[StataNonCatValueLabel] = [] 

2327 if self._non_cat_value_labels is None: 

2328 return non_cat_value_labels 

2329 

2330 for labname, labels in self._non_cat_value_labels.items(): 

2331 if labname in self._converted_names: 

2332 colname = self._converted_names[labname] 

2333 elif labname in data.columns: 

2334 colname = str(labname) 

2335 else: 

2336 raise KeyError( 

2337 f"Can't create value labels for {labname}, it wasn't " 

2338 "found in the dataset." 

2339 ) 

2340 

2341 if not is_numeric_dtype(data[colname].dtype): 

2342 # Labels should not be passed explicitly for categorical 

2343 # columns that will be converted to int 

2344 raise ValueError( 

2345 f"Can't create value labels for {labname}, value labels " 

2346 "can only be applied to numeric columns." 

2347 ) 

2348 svl = StataNonCatValueLabel(colname, labels, self._encoding) 

2349 non_cat_value_labels.append(svl) 

2350 return non_cat_value_labels 

2351 

2352 def _prepare_categoricals(self, data: DataFrame) -> DataFrame: 

2353 """ 

2354 Check for categorical columns, retain categorical information for 

2355 Stata file and convert categorical data to int 

2356 """ 

2357 is_cat = [is_categorical_dtype(data[col].dtype) for col in data] 

2358 if not any(is_cat): 

2359 return data 

2360 

2361 self._has_value_labels |= np.array(is_cat) 

2362 

2363 get_base_missing_value = StataMissingValue.get_base_missing_value 

2364 data_formatted = [] 

2365 for col, col_is_cat in zip(data, is_cat): 

2366 if col_is_cat: 

2367 svl = StataValueLabel(data[col], encoding=self._encoding) 

2368 self._value_labels.append(svl) 

2369 dtype = data[col].cat.codes.dtype 

2370 if dtype == np.int64: 

2371 raise ValueError( 

2372 "It is not possible to export " 

2373 "int64-based categorical data to Stata." 

2374 ) 

2375 values = data[col].cat.codes._values.copy() 

2376 

2377 # Upcast if needed so that correct missing values can be set 

2378 if values.max() >= get_base_missing_value(dtype): 

2379 if dtype == np.int8: 

2380 dtype = np.dtype(np.int16) 

2381 elif dtype == np.int16: 

2382 dtype = np.dtype(np.int32) 

2383 else: 

2384 dtype = np.dtype(np.float64) 

2385 values = np.array(values, dtype=dtype) 

2386 

2387 # Replace missing values with Stata missing value for type 

2388 values[values == -1] = get_base_missing_value(dtype) 

2389 data_formatted.append((col, values)) 

2390 else: 

2391 data_formatted.append((col, data[col])) 

2392 return DataFrame.from_dict(dict(data_formatted)) 

2393 

2394 def _replace_nans(self, data: DataFrame) -> DataFrame: 

2395 # return data 

2396 """ 

2397 Checks floating point data columns for nans, and replaces these with 

2398 the generic Stata for missing value (.) 

2399 """ 

2400 for c in data: 

2401 dtype = data[c].dtype 

2402 if dtype in (np.float32, np.float64): 

2403 if dtype == np.float32: 

2404 replacement = self.MISSING_VALUES["f"] 

2405 else: 

2406 replacement = self.MISSING_VALUES["d"] 

2407 data[c] = data[c].fillna(replacement) 

2408 

2409 return data 

2410 

2411 def _update_strl_names(self) -> None: 

2412 """No-op, forward compatibility""" 

2413 pass 

2414 

2415 def _validate_variable_name(self, name: str) -> str: 

2416 """ 

2417 Validate variable names for Stata export. 

2418 

2419 Parameters 

2420 ---------- 

2421 name : str 

2422 Variable name 

2423 

2424 Returns 

2425 ------- 

2426 str 

2427 The validated name with invalid characters replaced with 

2428 underscores. 

2429 

2430 Notes 

2431 ----- 

2432 Stata 114 and 117 support ascii characters in a-z, A-Z, 0-9 

2433 and _. 

2434 """ 

2435 for c in name: 

2436 if ( 

2437 (c < "A" or c > "Z") 

2438 and (c < "a" or c > "z") 

2439 and (c < "0" or c > "9") 

2440 and c != "_" 

2441 ): 

2442 name = name.replace(c, "_") 

2443 return name 

2444 

2445 def _check_column_names(self, data: DataFrame) -> DataFrame: 

2446 """ 

2447 Checks column names to ensure that they are valid Stata column names. 

2448 This includes checks for: 

2449 * Non-string names 

2450 * Stata keywords 

2451 * Variables that start with numbers 

2452 * Variables with names that are too long 

2453 

2454 When an illegal variable name is detected, it is converted, and if 

2455 dates are exported, the variable name is propagated to the date 

2456 conversion dictionary 

2457 """ 

2458 converted_names: dict[Hashable, str] = {} 

2459 columns = list(data.columns) 

2460 original_columns = columns[:] 

2461 

2462 duplicate_var_id = 0 

2463 for j, name in enumerate(columns): 

2464 orig_name = name 

2465 if not isinstance(name, str): 

2466 name = str(name) 

2467 

2468 name = self._validate_variable_name(name) 

2469 

2470 # Variable name must not be a reserved word 

2471 if name in self.RESERVED_WORDS: 

2472 name = "_" + name 

2473 

2474 # Variable name may not start with a number 

2475 if "0" <= name[0] <= "9": 

2476 name = "_" + name 

2477 

2478 name = name[: min(len(name), 32)] 

2479 

2480 if not name == orig_name: 

2481 # check for duplicates 

2482 while columns.count(name) > 0: 

2483 # prepend ascending number to avoid duplicates 

2484 name = "_" + str(duplicate_var_id) + name 

2485 name = name[: min(len(name), 32)] 

2486 duplicate_var_id += 1 

2487 converted_names[orig_name] = name 

2488 

2489 columns[j] = name 

2490 

2491 data.columns = Index(columns) 

2492 

2493 # Check date conversion, and fix key if needed 

2494 if self._convert_dates: 

2495 for c, o in zip(columns, original_columns): 

2496 if c != o: 

2497 self._convert_dates[c] = self._convert_dates[o] 

2498 del self._convert_dates[o] 

2499 

2500 if converted_names: 

2501 conversion_warning = [] 

2502 for orig_name, name in converted_names.items(): 

2503 msg = f"{orig_name} -> {name}" 

2504 conversion_warning.append(msg) 

2505 

2506 ws = invalid_name_doc.format("\n ".join(conversion_warning)) 

2507 warnings.warn( 

2508 ws, 

2509 InvalidColumnName, 

2510 stacklevel=find_stack_level(), 

2511 ) 

2512 

2513 self._converted_names = converted_names 

2514 self._update_strl_names() 

2515 

2516 return data 

2517 

2518 def _set_formats_and_types(self, dtypes: Series) -> None: 

2519 self.fmtlist: list[str] = [] 

2520 self.typlist: list[int] = [] 

2521 for col, dtype in dtypes.items(): 

2522 self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, self.data[col])) 

2523 self.typlist.append(_dtype_to_stata_type(dtype, self.data[col])) 

2524 

2525 def _prepare_pandas(self, data: DataFrame) -> None: 

2526 # NOTE: we might need a different API / class for pandas objects so 

2527 # we can set different semantics - handle this with a PR to pandas.io 

2528 

2529 data = data.copy() 

2530 

2531 if self._write_index: 

2532 temp = data.reset_index() 

2533 if isinstance(temp, DataFrame): 

2534 data = temp 

2535 

2536 # Ensure column names are strings 

2537 data = self._check_column_names(data) 

2538 

2539 # Check columns for compatibility with stata, upcast if necessary 

2540 # Raise if outside the supported range 

2541 data = _cast_to_stata_types(data) 

2542 

2543 # Replace NaNs with Stata missing values 

2544 data = self._replace_nans(data) 

2545 

2546 # Set all columns to initially unlabelled 

2547 self._has_value_labels = np.repeat(False, data.shape[1]) 

2548 

2549 # Create value labels for non-categorical data 

2550 non_cat_value_labels = self._prepare_non_cat_value_labels(data) 

2551 

2552 non_cat_columns = [svl.labname for svl in non_cat_value_labels] 

2553 has_non_cat_val_labels = data.columns.isin(non_cat_columns) 

2554 self._has_value_labels |= has_non_cat_val_labels 

2555 self._value_labels.extend(non_cat_value_labels) 

2556 

2557 # Convert categoricals to int data, and strip labels 

2558 data = self._prepare_categoricals(data) 

2559 

2560 self.nobs, self.nvar = data.shape 

2561 self.data = data 

2562 self.varlist = data.columns.tolist() 

2563 

2564 dtypes = data.dtypes 

2565 

2566 # Ensure all date columns are converted 

2567 for col in data: 

2568 if col in self._convert_dates: 

2569 continue 

2570 if is_datetime64_dtype(data[col]): 

2571 self._convert_dates[col] = "tc" 

2572 

2573 self._convert_dates = _maybe_convert_to_int_keys( 

2574 self._convert_dates, self.varlist 

2575 ) 

2576 for key in self._convert_dates: 

2577 new_type = _convert_datetime_to_stata_type(self._convert_dates[key]) 

2578 dtypes[key] = np.dtype(new_type) 

2579 

2580 # Verify object arrays are strings and encode to bytes 

2581 self._encode_strings() 

2582 

2583 self._set_formats_and_types(dtypes) 

2584 

2585 # set the given format for the datetime cols 

2586 if self._convert_dates is not None: 

2587 for key in self._convert_dates: 

2588 if isinstance(key, int): 

2589 self.fmtlist[key] = self._convert_dates[key] 

2590 

2591 def _encode_strings(self) -> None: 

2592 """ 

2593 Encode strings in dta-specific encoding 

2594 

2595 Do not encode columns marked for date conversion or for strL 

2596 conversion. The strL converter independently handles conversion and 

2597 also accepts empty string arrays. 

2598 """ 

2599 convert_dates = self._convert_dates 

2600 # _convert_strl is not available in dta 114 

2601 convert_strl = getattr(self, "_convert_strl", []) 

2602 for i, col in enumerate(self.data): 

2603 # Skip columns marked for date conversion or strl conversion 

2604 if i in convert_dates or col in convert_strl: 

2605 continue 

2606 column = self.data[col] 

2607 dtype = column.dtype 

2608 if dtype.type is np.object_: 

2609 inferred_dtype = infer_dtype(column, skipna=True) 

2610 if not ((inferred_dtype == "string") or len(column) == 0): 

2611 col = column.name 

2612 raise ValueError( 

2613 f"""\ 

2614Column `{col}` cannot be exported.\n\nOnly string-like object arrays 

2615containing all strings or a mix of strings and None can be exported. 

2616Object arrays containing only null values are prohibited. Other object 

2617types cannot be exported and must first be converted to one of the 

2618supported types.""" 

2619 ) 

2620 encoded = self.data[col].str.encode(self._encoding) 

2621 # If larger than _max_string_length do nothing 

2622 if ( 

2623 max_len_string_array(ensure_object(encoded._values)) 

2624 <= self._max_string_length 

2625 ): 

2626 self.data[col] = encoded 

2627 

2628 def write_file(self) -> None: 

2629 """ 

2630 Export DataFrame object to Stata dta format. 

2631 """ 

2632 with get_handle( 

2633 self._fname, 

2634 "wb", 

2635 compression=self._compression, 

2636 is_text=False, 

2637 storage_options=self.storage_options, 

2638 ) as self.handles: 

2639 

2640 if self.handles.compression["method"] is not None: 

2641 # ZipFile creates a file (with the same name) for each write call. 

2642 # Write it first into a buffer and then write the buffer to the ZipFile. 

2643 self._output_file, self.handles.handle = self.handles.handle, BytesIO() 

2644 self.handles.created_handles.append(self.handles.handle) 

2645 

2646 try: 

2647 self._write_header( 

2648 data_label=self._data_label, time_stamp=self._time_stamp 

2649 ) 

2650 self._write_map() 

2651 self._write_variable_types() 

2652 self._write_varnames() 

2653 self._write_sortlist() 

2654 self._write_formats() 

2655 self._write_value_label_names() 

2656 self._write_variable_labels() 

2657 self._write_expansion_fields() 

2658 self._write_characteristics() 

2659 records = self._prepare_data() 

2660 self._write_data(records) 

2661 self._write_strls() 

2662 self._write_value_labels() 

2663 self._write_file_close_tag() 

2664 self._write_map() 

2665 self._close() 

2666 except Exception as exc: 

2667 self.handles.close() 

2668 if isinstance(self._fname, (str, os.PathLike)) and os.path.isfile( 

2669 self._fname 

2670 ): 

2671 try: 

2672 os.unlink(self._fname) 

2673 except OSError: 

2674 warnings.warn( 

2675 f"This save was not successful but {self._fname} could not " 

2676 "be deleted. This file is not valid.", 

2677 ResourceWarning, 

2678 stacklevel=find_stack_level(), 

2679 ) 

2680 raise exc 

2681 

2682 def _close(self) -> None: 

2683 """ 

2684 Close the file if it was created by the writer. 

2685 

2686 If a buffer or file-like object was passed in, for example a GzipFile, 

2687 then leave this file open for the caller to close. 

2688 """ 

2689 # write compression 

2690 if self._output_file is not None: 

2691 assert isinstance(self.handles.handle, BytesIO) 

2692 bio, self.handles.handle = self.handles.handle, self._output_file 

2693 self.handles.handle.write(bio.getvalue()) 

2694 

2695 def _write_map(self) -> None: 

2696 """No-op, future compatibility""" 

2697 pass 

2698 

2699 def _write_file_close_tag(self) -> None: 

2700 """No-op, future compatibility""" 

2701 pass 

2702 

2703 def _write_characteristics(self) -> None: 

2704 """No-op, future compatibility""" 

2705 pass 

2706 

2707 def _write_strls(self) -> None: 

2708 """No-op, future compatibility""" 

2709 pass 

2710 

2711 def _write_expansion_fields(self) -> None: 

2712 """Write 5 zeros for expansion fields""" 

2713 self._write(_pad_bytes("", 5)) 

2714 

2715 def _write_value_labels(self) -> None: 

2716 for vl in self._value_labels: 

2717 self._write_bytes(vl.generate_value_label(self._byteorder)) 

2718 

2719 def _write_header( 

2720 self, 

2721 data_label: str | None = None, 

2722 time_stamp: datetime.datetime | None = None, 

2723 ) -> None: 

2724 byteorder = self._byteorder 

2725 # ds_format - just use 114 

2726 self._write_bytes(struct.pack("b", 114)) 

2727 # byteorder 

2728 self._write(byteorder == ">" and "\x01" or "\x02") 

2729 # filetype 

2730 self._write("\x01") 

2731 # unused 

2732 self._write("\x00") 

2733 # number of vars, 2 bytes 

2734 self._write_bytes(struct.pack(byteorder + "h", self.nvar)[:2]) 

2735 # number of obs, 4 bytes 

2736 self._write_bytes(struct.pack(byteorder + "i", self.nobs)[:4]) 

2737 # data label 81 bytes, char, null terminated 

2738 if data_label is None: 

2739 self._write_bytes(self._null_terminate_bytes(_pad_bytes("", 80))) 

2740 else: 

2741 self._write_bytes( 

2742 self._null_terminate_bytes(_pad_bytes(data_label[:80], 80)) 

2743 ) 

2744 # time stamp, 18 bytes, char, null terminated 

2745 # format dd Mon yyyy hh:mm 

2746 if time_stamp is None: 

2747 time_stamp = datetime.datetime.now() 

2748 elif not isinstance(time_stamp, datetime.datetime): 

2749 raise ValueError("time_stamp should be datetime type") 

2750 # GH #13856 

2751 # Avoid locale-specific month conversion 

2752 months = [ 

2753 "Jan", 

2754 "Feb", 

2755 "Mar", 

2756 "Apr", 

2757 "May", 

2758 "Jun", 

2759 "Jul", 

2760 "Aug", 

2761 "Sep", 

2762 "Oct", 

2763 "Nov", 

2764 "Dec", 

2765 ] 

2766 month_lookup = {i + 1: month for i, month in enumerate(months)} 

2767 ts = ( 

2768 time_stamp.strftime("%d ") 

2769 + month_lookup[time_stamp.month] 

2770 + time_stamp.strftime(" %Y %H:%M") 

2771 ) 

2772 self._write_bytes(self._null_terminate_bytes(ts)) 

2773 

2774 def _write_variable_types(self) -> None: 

2775 for typ in self.typlist: 

2776 self._write_bytes(struct.pack("B", typ)) 

2777 

2778 def _write_varnames(self) -> None: 

2779 # varlist names are checked by _check_column_names 

2780 # varlist, requires null terminated 

2781 for name in self.varlist: 

2782 name = self._null_terminate_str(name) 

2783 name = _pad_bytes(name[:32], 33) 

2784 self._write(name) 

2785 

2786 def _write_sortlist(self) -> None: 

2787 # srtlist, 2*(nvar+1), int array, encoded by byteorder 

2788 srtlist = _pad_bytes("", 2 * (self.nvar + 1)) 

2789 self._write(srtlist) 

2790 

2791 def _write_formats(self) -> None: 

2792 # fmtlist, 49*nvar, char array 

2793 for fmt in self.fmtlist: 

2794 self._write(_pad_bytes(fmt, 49)) 

2795 

2796 def _write_value_label_names(self) -> None: 

2797 # lbllist, 33*nvar, char array 

2798 for i in range(self.nvar): 

2799 # Use variable name when categorical 

2800 if self._has_value_labels[i]: 

2801 name = self.varlist[i] 

2802 name = self._null_terminate_str(name) 

2803 name = _pad_bytes(name[:32], 33) 

2804 self._write(name) 

2805 else: # Default is empty label 

2806 self._write(_pad_bytes("", 33)) 

2807 

2808 def _write_variable_labels(self) -> None: 

2809 # Missing labels are 80 blank characters plus null termination 

2810 blank = _pad_bytes("", 81) 

2811 

2812 if self._variable_labels is None: 

2813 for i in range(self.nvar): 

2814 self._write(blank) 

2815 return 

2816 

2817 for col in self.data: 

2818 if col in self._variable_labels: 

2819 label = self._variable_labels[col] 

2820 if len(label) > 80: 

2821 raise ValueError("Variable labels must be 80 characters or fewer") 

2822 is_latin1 = all(ord(c) < 256 for c in label) 

2823 if not is_latin1: 

2824 raise ValueError( 

2825 "Variable labels must contain only characters that " 

2826 "can be encoded in Latin-1" 

2827 ) 

2828 self._write(_pad_bytes(label, 81)) 

2829 else: 

2830 self._write(blank) 

2831 

2832 def _convert_strls(self, data: DataFrame) -> DataFrame: 

2833 """No-op, future compatibility""" 

2834 return data 

2835 

2836 def _prepare_data(self) -> np.recarray: 

2837 data = self.data 

2838 typlist = self.typlist 

2839 convert_dates = self._convert_dates 

2840 # 1. Convert dates 

2841 if self._convert_dates is not None: 

2842 for i, col in enumerate(data): 

2843 if i in convert_dates: 

2844 data[col] = _datetime_to_stata_elapsed_vec( 

2845 data[col], self.fmtlist[i] 

2846 ) 

2847 # 2. Convert strls 

2848 data = self._convert_strls(data) 

2849 

2850 # 3. Convert bad string data to '' and pad to correct length 

2851 dtypes = {} 

2852 native_byteorder = self._byteorder == _set_endianness(sys.byteorder) 

2853 for i, col in enumerate(data): 

2854 typ = typlist[i] 

2855 if typ <= self._max_string_length: 

2856 data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,)) 

2857 stype = f"S{typ}" 

2858 dtypes[col] = stype 

2859 data[col] = data[col].astype(stype) 

2860 else: 

2861 dtype = data[col].dtype 

2862 if not native_byteorder: 

2863 dtype = dtype.newbyteorder(self._byteorder) 

2864 dtypes[col] = dtype 

2865 

2866 return data.to_records(index=False, column_dtypes=dtypes) 

2867 

2868 def _write_data(self, records: np.recarray) -> None: 

2869 self._write_bytes(records.tobytes()) 

2870 

2871 @staticmethod 

2872 def _null_terminate_str(s: str) -> str: 

2873 s += "\x00" 

2874 return s 

2875 

2876 def _null_terminate_bytes(self, s: str) -> bytes: 

2877 return self._null_terminate_str(s).encode(self._encoding) 

2878 

2879 

2880def _dtype_to_stata_type_117(dtype: np.dtype, column: Series, force_strl: bool) -> int: 

2881 """ 

2882 Converts dtype types to stata types. Returns the byte of the given ordinal. 

2883 See TYPE_MAP and comments for an explanation. This is also explained in 

2884 the dta spec. 

2885 1 - 2045 are strings of this length 

2886 Pandas Stata 

2887 32768 - for object strL 

2888 65526 - for int8 byte 

2889 65527 - for int16 int 

2890 65528 - for int32 long 

2891 65529 - for float32 float 

2892 65530 - for double double 

2893 

2894 If there are dates to convert, then dtype will already have the correct 

2895 type inserted. 

2896 """ 

2897 # TODO: expand to handle datetime to integer conversion 

2898 if force_strl: 

2899 return 32768 

2900 if dtype.type is np.object_: # try to coerce it to the biggest string 

2901 # not memory efficient, what else could we 

2902 # do? 

2903 itemsize = max_len_string_array(ensure_object(column._values)) 

2904 itemsize = max(itemsize, 1) 

2905 if itemsize <= 2045: 

2906 return itemsize 

2907 return 32768 

2908 elif dtype.type is np.float64: 

2909 return 65526 

2910 elif dtype.type is np.float32: 

2911 return 65527 

2912 elif dtype.type is np.int32: 

2913 return 65528 

2914 elif dtype.type is np.int16: 

2915 return 65529 

2916 elif dtype.type is np.int8: 

2917 return 65530 

2918 else: # pragma : no cover 

2919 raise NotImplementedError(f"Data type {dtype} not supported.") 

2920 

2921 

2922def _pad_bytes_new(name: str | bytes, length: int) -> bytes: 

2923 """ 

2924 Takes a bytes instance and pads it with null bytes until it's length chars. 

2925 """ 

2926 if isinstance(name, str): 

2927 name = bytes(name, "utf-8") 

2928 return name + b"\x00" * (length - len(name)) 

2929 

2930 

2931class StataStrLWriter: 

2932 """ 

2933 Converter for Stata StrLs 

2934 

2935 Stata StrLs map 8 byte values to strings which are stored using a 

2936 dictionary-like format where strings are keyed to two values. 

2937 

2938 Parameters 

2939 ---------- 

2940 df : DataFrame 

2941 DataFrame to convert 

2942 columns : Sequence[str] 

2943 List of columns names to convert to StrL 

2944 version : int, optional 

2945 dta version. Currently supports 117, 118 and 119 

2946 byteorder : str, optional 

2947 Can be ">", "<", "little", or "big". default is `sys.byteorder` 

2948 

2949 Notes 

2950 ----- 

2951 Supports creation of the StrL block of a dta file for dta versions 

2952 117, 118 and 119. These differ in how the GSO is stored. 118 and 

2953 119 store the GSO lookup value as a uint32 and a uint64, while 117 

2954 uses two uint32s. 118 and 119 also encode all strings as unicode 

2955 which is required by the format. 117 uses 'latin-1' a fixed width 

2956 encoding that extends the 7-bit ascii table with an additional 128 

2957 characters. 

2958 """ 

2959 

2960 def __init__( 

2961 self, 

2962 df: DataFrame, 

2963 columns: Sequence[str], 

2964 version: int = 117, 

2965 byteorder: str | None = None, 

2966 ) -> None: 

2967 if version not in (117, 118, 119): 

2968 raise ValueError("Only dta versions 117, 118 and 119 supported") 

2969 self._dta_ver = version 

2970 

2971 self.df = df 

2972 self.columns = columns 

2973 self._gso_table = {"": (0, 0)} 

2974 if byteorder is None: 

2975 byteorder = sys.byteorder 

2976 self._byteorder = _set_endianness(byteorder) 

2977 

2978 gso_v_type = "I" # uint32 

2979 gso_o_type = "Q" # uint64 

2980 self._encoding = "utf-8" 

2981 if version == 117: 

2982 o_size = 4 

2983 gso_o_type = "I" # 117 used uint32 

2984 self._encoding = "latin-1" 

2985 elif version == 118: 

2986 o_size = 6 

2987 else: # version == 119 

2988 o_size = 5 

2989 self._o_offet = 2 ** (8 * (8 - o_size)) 

2990 self._gso_o_type = gso_o_type 

2991 self._gso_v_type = gso_v_type 

2992 

2993 def _convert_key(self, key: tuple[int, int]) -> int: 

2994 v, o = key 

2995 return v + self._o_offet * o 

2996 

2997 def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]: 

2998 """ 

2999 Generates the GSO lookup table for the DataFrame 

3000 

3001 Returns 

3002 ------- 

3003 gso_table : dict 

3004 Ordered dictionary using the string found as keys 

3005 and their lookup position (v,o) as values 

3006 gso_df : DataFrame 

3007 DataFrame where strl columns have been converted to 

3008 (v,o) values 

3009 

3010 Notes 

3011 ----- 

3012 Modifies the DataFrame in-place. 

3013 

3014 The DataFrame returned encodes the (v,o) values as uint64s. The 

3015 encoding depends on the dta version, and can be expressed as 

3016 

3017 enc = v + o * 2 ** (o_size * 8) 

3018 

3019 so that v is stored in the lower bits and o is in the upper 

3020 bits. o_size is 

3021 

3022 * 117: 4 

3023 * 118: 6 

3024 * 119: 5 

3025 """ 

3026 gso_table = self._gso_table 

3027 gso_df = self.df 

3028 columns = list(gso_df.columns) 

3029 selected = gso_df[self.columns] 

3030 col_index = [(col, columns.index(col)) for col in self.columns] 

3031 keys = np.empty(selected.shape, dtype=np.uint64) 

3032 for o, (idx, row) in enumerate(selected.iterrows()): 

3033 for j, (col, v) in enumerate(col_index): 

3034 val = row[col] 

3035 # Allow columns with mixed str and None (GH 23633) 

3036 val = "" if val is None else val 

3037 key = gso_table.get(val, None) 

3038 if key is None: 

3039 # Stata prefers human numbers 

3040 key = (v + 1, o + 1) 

3041 gso_table[val] = key 

3042 keys[o, j] = self._convert_key(key) 

3043 for i, col in enumerate(self.columns): 

3044 gso_df[col] = keys[:, i] 

3045 

3046 return gso_table, gso_df 

3047 

3048 def generate_blob(self, gso_table: dict[str, tuple[int, int]]) -> bytes: 

3049 """ 

3050 Generates the binary blob of GSOs that is written to the dta file. 

3051 

3052 Parameters 

3053 ---------- 

3054 gso_table : dict 

3055 Ordered dictionary (str, vo) 

3056 

3057 Returns 

3058 ------- 

3059 gso : bytes 

3060 Binary content of dta file to be placed between strl tags 

3061 

3062 Notes 

3063 ----- 

3064 Output format depends on dta version. 117 uses two uint32s to 

3065 express v and o while 118+ uses a uint32 for v and a uint64 for o. 

3066 """ 

3067 # Format information 

3068 # Length includes null term 

3069 # 117 

3070 # GSOvvvvooootllllxxxxxxxxxxxxxxx...x 

3071 # 3 u4 u4 u1 u4 string + null term 

3072 # 

3073 # 118, 119 

3074 # GSOvvvvooooooootllllxxxxxxxxxxxxxxx...x 

3075 # 3 u4 u8 u1 u4 string + null term 

3076 

3077 bio = BytesIO() 

3078 gso = bytes("GSO", "ascii") 

3079 gso_type = struct.pack(self._byteorder + "B", 130) 

3080 null = struct.pack(self._byteorder + "B", 0) 

3081 v_type = self._byteorder + self._gso_v_type 

3082 o_type = self._byteorder + self._gso_o_type 

3083 len_type = self._byteorder + "I" 

3084 for strl, vo in gso_table.items(): 

3085 if vo == (0, 0): 

3086 continue 

3087 v, o = vo 

3088 

3089 # GSO 

3090 bio.write(gso) 

3091 

3092 # vvvv 

3093 bio.write(struct.pack(v_type, v)) 

3094 

3095 # oooo / oooooooo 

3096 bio.write(struct.pack(o_type, o)) 

3097 

3098 # t 

3099 bio.write(gso_type) 

3100 

3101 # llll 

3102 utf8_string = bytes(strl, "utf-8") 

3103 bio.write(struct.pack(len_type, len(utf8_string) + 1)) 

3104 

3105 # xxx...xxx 

3106 bio.write(utf8_string) 

3107 bio.write(null) 

3108 

3109 return bio.getvalue() 

3110 

3111 

3112class StataWriter117(StataWriter): 

3113 """ 

3114 A class for writing Stata binary dta files in Stata 13 format (117) 

3115 

3116 Parameters 

3117 ---------- 

3118 fname : path (string), buffer or path object 

3119 string, path object (pathlib.Path or py._path.local.LocalPath) or 

3120 object implementing a binary write() functions. If using a buffer 

3121 then the buffer will not be automatically closed after the file 

3122 is written. 

3123 data : DataFrame 

3124 Input to save 

3125 convert_dates : dict 

3126 Dictionary mapping columns containing datetime types to stata internal 

3127 format to use when writing the dates. Options are 'tc', 'td', 'tm', 

3128 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. 

3129 Datetime columns that do not have a conversion type specified will be 

3130 converted to 'tc'. Raises NotImplementedError if a datetime column has 

3131 timezone information 

3132 write_index : bool 

3133 Write the index to Stata dataset. 

3134 byteorder : str 

3135 Can be ">", "<", "little", or "big". default is `sys.byteorder` 

3136 time_stamp : datetime 

3137 A datetime to use as file creation date. Default is the current time 

3138 data_label : str 

3139 A label for the data set. Must be 80 characters or smaller. 

3140 variable_labels : dict 

3141 Dictionary containing columns as keys and variable labels as values. 

3142 Each label must be 80 characters or smaller. 

3143 convert_strl : list 

3144 List of columns names to convert to Stata StrL format. Columns with 

3145 more than 2045 characters are automatically written as StrL. 

3146 Smaller columns can be converted by including the column name. Using 

3147 StrLs can reduce output file size when strings are longer than 8 

3148 characters, and either frequently repeated or sparse. 

3149 {compression_options} 

3150 

3151 .. versionadded:: 1.1.0 

3152 

3153 .. versionchanged:: 1.4.0 Zstandard support. 

3154 

3155 value_labels : dict of dicts 

3156 Dictionary containing columns as keys and dictionaries of column value 

3157 to labels as values. The combined length of all labels for a single 

3158 variable must be 32,000 characters or smaller. 

3159 

3160 .. versionadded:: 1.4.0 

3161 

3162 Returns 

3163 ------- 

3164 writer : StataWriter117 instance 

3165 The StataWriter117 instance has a write_file method, which will 

3166 write the file to the given `fname`. 

3167 

3168 Raises 

3169 ------ 

3170 NotImplementedError 

3171 * If datetimes contain timezone information 

3172 ValueError 

3173 * Columns listed in convert_dates are neither datetime64[ns] 

3174 or datetime.datetime 

3175 * Column dtype is not representable in Stata 

3176 * Column listed in convert_dates is not in DataFrame 

3177 * Categorical label contains more than 32,000 characters 

3178 

3179 Examples 

3180 -------- 

3181 >>> data = pd.DataFrame([[1.0, 1, 'a']], columns=['a', 'b', 'c']) 

3182 >>> writer = pd.io.stata.StataWriter117('./data_file.dta', data) 

3183 >>> writer.write_file() 

3184 

3185 Directly write a zip file 

3186 >>> compression = {"method": "zip", "archive_name": "data_file.dta"} 

3187 >>> writer = pd.io.stata.StataWriter117( 

3188 ... './data_file.zip', data, compression=compression 

3189 ... ) 

3190 >>> writer.write_file() 

3191 

3192 Or with long strings stored in strl format 

3193 >>> data = pd.DataFrame([['A relatively long string'], [''], ['']], 

3194 ... columns=['strls']) 

3195 >>> writer = pd.io.stata.StataWriter117( 

3196 ... './data_file_with_long_strings.dta', data, convert_strl=['strls']) 

3197 >>> writer.write_file() 

3198 """ 

3199 

3200 _max_string_length = 2045 

3201 _dta_version = 117 

3202 

3203 def __init__( 

3204 self, 

3205 fname: FilePath | WriteBuffer[bytes], 

3206 data: DataFrame, 

3207 convert_dates: dict[Hashable, str] | None = None, 

3208 write_index: bool = True, 

3209 byteorder: str | None = None, 

3210 time_stamp: datetime.datetime | None = None, 

3211 data_label: str | None = None, 

3212 variable_labels: dict[Hashable, str] | None = None, 

3213 convert_strl: Sequence[Hashable] | None = None, 

3214 compression: CompressionOptions = "infer", 

3215 storage_options: StorageOptions = None, 

3216 *, 

3217 value_labels: dict[Hashable, dict[float, str]] | None = None, 

3218 ) -> None: 

3219 # Copy to new list since convert_strl might be modified later 

3220 self._convert_strl: list[Hashable] = [] 

3221 if convert_strl is not None: 

3222 self._convert_strl.extend(convert_strl) 

3223 

3224 super().__init__( 

3225 fname, 

3226 data, 

3227 convert_dates, 

3228 write_index, 

3229 byteorder=byteorder, 

3230 time_stamp=time_stamp, 

3231 data_label=data_label, 

3232 variable_labels=variable_labels, 

3233 value_labels=value_labels, 

3234 compression=compression, 

3235 storage_options=storage_options, 

3236 ) 

3237 self._map: dict[str, int] = {} 

3238 self._strl_blob = b"" 

3239 

3240 @staticmethod 

3241 def _tag(val: str | bytes, tag: str) -> bytes: 

3242 """Surround val with <tag></tag>""" 

3243 if isinstance(val, str): 

3244 val = bytes(val, "utf-8") 

3245 return bytes("<" + tag + ">", "utf-8") + val + bytes("</" + tag + ">", "utf-8") 

3246 

3247 def _update_map(self, tag: str) -> None: 

3248 """Update map location for tag with file position""" 

3249 assert self.handles.handle is not None 

3250 self._map[tag] = self.handles.handle.tell() 

3251 

3252 def _write_header( 

3253 self, 

3254 data_label: str | None = None, 

3255 time_stamp: datetime.datetime | None = None, 

3256 ) -> None: 

3257 """Write the file header""" 

3258 byteorder = self._byteorder 

3259 self._write_bytes(bytes("<stata_dta>", "utf-8")) 

3260 bio = BytesIO() 

3261 # ds_format - 117 

3262 bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release")) 

3263 # byteorder 

3264 bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder")) 

3265 # number of vars, 2 bytes in 117 and 118, 4 byte in 119 

3266 nvar_type = "H" if self._dta_version <= 118 else "I" 

3267 bio.write(self._tag(struct.pack(byteorder + nvar_type, self.nvar), "K")) 

3268 # 117 uses 4 bytes, 118 uses 8 

3269 nobs_size = "I" if self._dta_version == 117 else "Q" 

3270 bio.write(self._tag(struct.pack(byteorder + nobs_size, self.nobs), "N")) 

3271 # data label 81 bytes, char, null terminated 

3272 label = data_label[:80] if data_label is not None else "" 

3273 encoded_label = label.encode(self._encoding) 

3274 label_size = "B" if self._dta_version == 117 else "H" 

3275 label_len = struct.pack(byteorder + label_size, len(encoded_label)) 

3276 encoded_label = label_len + encoded_label 

3277 bio.write(self._tag(encoded_label, "label")) 

3278 # time stamp, 18 bytes, char, null terminated 

3279 # format dd Mon yyyy hh:mm 

3280 if time_stamp is None: 

3281 time_stamp = datetime.datetime.now() 

3282 elif not isinstance(time_stamp, datetime.datetime): 

3283 raise ValueError("time_stamp should be datetime type") 

3284 # Avoid locale-specific month conversion 

3285 months = [ 

3286 "Jan", 

3287 "Feb", 

3288 "Mar", 

3289 "Apr", 

3290 "May", 

3291 "Jun", 

3292 "Jul", 

3293 "Aug", 

3294 "Sep", 

3295 "Oct", 

3296 "Nov", 

3297 "Dec", 

3298 ] 

3299 month_lookup = {i + 1: month for i, month in enumerate(months)} 

3300 ts = ( 

3301 time_stamp.strftime("%d ") 

3302 + month_lookup[time_stamp.month] 

3303 + time_stamp.strftime(" %Y %H:%M") 

3304 ) 

3305 # '\x11' added due to inspection of Stata file 

3306 stata_ts = b"\x11" + bytes(ts, "utf-8") 

3307 bio.write(self._tag(stata_ts, "timestamp")) 

3308 self._write_bytes(self._tag(bio.getvalue(), "header")) 

3309 

3310 def _write_map(self) -> None: 

3311 """ 

3312 Called twice during file write. The first populates the values in 

3313 the map with 0s. The second call writes the final map locations when 

3314 all blocks have been written. 

3315 """ 

3316 if not self._map: 

3317 self._map = { 

3318 "stata_data": 0, 

3319 "map": self.handles.handle.tell(), 

3320 "variable_types": 0, 

3321 "varnames": 0, 

3322 "sortlist": 0, 

3323 "formats": 0, 

3324 "value_label_names": 0, 

3325 "variable_labels": 0, 

3326 "characteristics": 0, 

3327 "data": 0, 

3328 "strls": 0, 

3329 "value_labels": 0, 

3330 "stata_data_close": 0, 

3331 "end-of-file": 0, 

3332 } 

3333 # Move to start of map 

3334 self.handles.handle.seek(self._map["map"]) 

3335 bio = BytesIO() 

3336 for val in self._map.values(): 

3337 bio.write(struct.pack(self._byteorder + "Q", val)) 

3338 self._write_bytes(self._tag(bio.getvalue(), "map")) 

3339 

3340 def _write_variable_types(self) -> None: 

3341 self._update_map("variable_types") 

3342 bio = BytesIO() 

3343 for typ in self.typlist: 

3344 bio.write(struct.pack(self._byteorder + "H", typ)) 

3345 self._write_bytes(self._tag(bio.getvalue(), "variable_types")) 

3346 

3347 def _write_varnames(self) -> None: 

3348 self._update_map("varnames") 

3349 bio = BytesIO() 

3350 # 118 scales by 4 to accommodate utf-8 data worst case encoding 

3351 vn_len = 32 if self._dta_version == 117 else 128 

3352 for name in self.varlist: 

3353 name = self._null_terminate_str(name) 

3354 name = _pad_bytes_new(name[:32].encode(self._encoding), vn_len + 1) 

3355 bio.write(name) 

3356 self._write_bytes(self._tag(bio.getvalue(), "varnames")) 

3357 

3358 def _write_sortlist(self) -> None: 

3359 self._update_map("sortlist") 

3360 sort_size = 2 if self._dta_version < 119 else 4 

3361 self._write_bytes(self._tag(b"\x00" * sort_size * (self.nvar + 1), "sortlist")) 

3362 

3363 def _write_formats(self) -> None: 

3364 self._update_map("formats") 

3365 bio = BytesIO() 

3366 fmt_len = 49 if self._dta_version == 117 else 57 

3367 for fmt in self.fmtlist: 

3368 bio.write(_pad_bytes_new(fmt.encode(self._encoding), fmt_len)) 

3369 self._write_bytes(self._tag(bio.getvalue(), "formats")) 

3370 

3371 def _write_value_label_names(self) -> None: 

3372 self._update_map("value_label_names") 

3373 bio = BytesIO() 

3374 # 118 scales by 4 to accommodate utf-8 data worst case encoding 

3375 vl_len = 32 if self._dta_version == 117 else 128 

3376 for i in range(self.nvar): 

3377 # Use variable name when categorical 

3378 name = "" # default name 

3379 if self._has_value_labels[i]: 

3380 name = self.varlist[i] 

3381 name = self._null_terminate_str(name) 

3382 encoded_name = _pad_bytes_new(name[:32].encode(self._encoding), vl_len + 1) 

3383 bio.write(encoded_name) 

3384 self._write_bytes(self._tag(bio.getvalue(), "value_label_names")) 

3385 

3386 def _write_variable_labels(self) -> None: 

3387 # Missing labels are 80 blank characters plus null termination 

3388 self._update_map("variable_labels") 

3389 bio = BytesIO() 

3390 # 118 scales by 4 to accommodate utf-8 data worst case encoding 

3391 vl_len = 80 if self._dta_version == 117 else 320 

3392 blank = _pad_bytes_new("", vl_len + 1) 

3393 

3394 if self._variable_labels is None: 

3395 for _ in range(self.nvar): 

3396 bio.write(blank) 

3397 self._write_bytes(self._tag(bio.getvalue(), "variable_labels")) 

3398 return 

3399 

3400 for col in self.data: 

3401 if col in self._variable_labels: 

3402 label = self._variable_labels[col] 

3403 if len(label) > 80: 

3404 raise ValueError("Variable labels must be 80 characters or fewer") 

3405 try: 

3406 encoded = label.encode(self._encoding) 

3407 except UnicodeEncodeError as err: 

3408 raise ValueError( 

3409 "Variable labels must contain only characters that " 

3410 f"can be encoded in {self._encoding}" 

3411 ) from err 

3412 

3413 bio.write(_pad_bytes_new(encoded, vl_len + 1)) 

3414 else: 

3415 bio.write(blank) 

3416 self._write_bytes(self._tag(bio.getvalue(), "variable_labels")) 

3417 

3418 def _write_characteristics(self) -> None: 

3419 self._update_map("characteristics") 

3420 self._write_bytes(self._tag(b"", "characteristics")) 

3421 

3422 def _write_data(self, records) -> None: 

3423 self._update_map("data") 

3424 self._write_bytes(b"<data>") 

3425 self._write_bytes(records.tobytes()) 

3426 self._write_bytes(b"</data>") 

3427 

3428 def _write_strls(self) -> None: 

3429 self._update_map("strls") 

3430 self._write_bytes(self._tag(self._strl_blob, "strls")) 

3431 

3432 def _write_expansion_fields(self) -> None: 

3433 """No-op in dta 117+""" 

3434 pass 

3435 

3436 def _write_value_labels(self) -> None: 

3437 self._update_map("value_labels") 

3438 bio = BytesIO() 

3439 for vl in self._value_labels: 

3440 lab = vl.generate_value_label(self._byteorder) 

3441 lab = self._tag(lab, "lbl") 

3442 bio.write(lab) 

3443 self._write_bytes(self._tag(bio.getvalue(), "value_labels")) 

3444 

3445 def _write_file_close_tag(self) -> None: 

3446 self._update_map("stata_data_close") 

3447 self._write_bytes(bytes("</stata_dta>", "utf-8")) 

3448 self._update_map("end-of-file") 

3449 

3450 def _update_strl_names(self) -> None: 

3451 """ 

3452 Update column names for conversion to strl if they might have been 

3453 changed to comply with Stata naming rules 

3454 """ 

3455 # Update convert_strl if names changed 

3456 for orig, new in self._converted_names.items(): 

3457 if orig in self._convert_strl: 

3458 idx = self._convert_strl.index(orig) 

3459 self._convert_strl[idx] = new 

3460 

3461 def _convert_strls(self, data: DataFrame) -> DataFrame: 

3462 """ 

3463 Convert columns to StrLs if either very large or in the 

3464 convert_strl variable 

3465 """ 

3466 convert_cols = [ 

3467 col 

3468 for i, col in enumerate(data) 

3469 if self.typlist[i] == 32768 or col in self._convert_strl 

3470 ] 

3471 

3472 if convert_cols: 

3473 ssw = StataStrLWriter(data, convert_cols, version=self._dta_version) 

3474 tab, new_data = ssw.generate_table() 

3475 data = new_data 

3476 self._strl_blob = ssw.generate_blob(tab) 

3477 return data 

3478 

3479 def _set_formats_and_types(self, dtypes: Series) -> None: 

3480 self.typlist = [] 

3481 self.fmtlist = [] 

3482 for col, dtype in dtypes.items(): 

3483 force_strl = col in self._convert_strl 

3484 fmt = _dtype_to_default_stata_fmt( 

3485 dtype, 

3486 self.data[col], 

3487 dta_version=self._dta_version, 

3488 force_strl=force_strl, 

3489 ) 

3490 self.fmtlist.append(fmt) 

3491 self.typlist.append( 

3492 _dtype_to_stata_type_117(dtype, self.data[col], force_strl) 

3493 ) 

3494 

3495 

3496class StataWriterUTF8(StataWriter117): 

3497 """ 

3498 Stata binary dta file writing in Stata 15 (118) and 16 (119) formats 

3499 

3500 DTA 118 and 119 format files support unicode string data (both fixed 

3501 and strL) format. Unicode is also supported in value labels, variable 

3502 labels and the dataset label. Format 119 is automatically used if the 

3503 file contains more than 32,767 variables. 

3504 

3505 .. versionadded:: 1.0.0 

3506 

3507 Parameters 

3508 ---------- 

3509 fname : path (string), buffer or path object 

3510 string, path object (pathlib.Path or py._path.local.LocalPath) or 

3511 object implementing a binary write() functions. If using a buffer 

3512 then the buffer will not be automatically closed after the file 

3513 is written. 

3514 data : DataFrame 

3515 Input to save 

3516 convert_dates : dict, default None 

3517 Dictionary mapping columns containing datetime types to stata internal 

3518 format to use when writing the dates. Options are 'tc', 'td', 'tm', 

3519 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. 

3520 Datetime columns that do not have a conversion type specified will be 

3521 converted to 'tc'. Raises NotImplementedError if a datetime column has 

3522 timezone information 

3523 write_index : bool, default True 

3524 Write the index to Stata dataset. 

3525 byteorder : str, default None 

3526 Can be ">", "<", "little", or "big". default is `sys.byteorder` 

3527 time_stamp : datetime, default None 

3528 A datetime to use as file creation date. Default is the current time 

3529 data_label : str, default None 

3530 A label for the data set. Must be 80 characters or smaller. 

3531 variable_labels : dict, default None 

3532 Dictionary containing columns as keys and variable labels as values. 

3533 Each label must be 80 characters or smaller. 

3534 convert_strl : list, default None 

3535 List of columns names to convert to Stata StrL format. Columns with 

3536 more than 2045 characters are automatically written as StrL. 

3537 Smaller columns can be converted by including the column name. Using 

3538 StrLs can reduce output file size when strings are longer than 8 

3539 characters, and either frequently repeated or sparse. 

3540 version : int, default None 

3541 The dta version to use. By default, uses the size of data to determine 

3542 the version. 118 is used if data.shape[1] <= 32767, and 119 is used 

3543 for storing larger DataFrames. 

3544 {compression_options} 

3545 

3546 .. versionadded:: 1.1.0 

3547 

3548 .. versionchanged:: 1.4.0 Zstandard support. 

3549 

3550 value_labels : dict of dicts 

3551 Dictionary containing columns as keys and dictionaries of column value 

3552 to labels as values. The combined length of all labels for a single 

3553 variable must be 32,000 characters or smaller. 

3554 

3555 .. versionadded:: 1.4.0 

3556 

3557 Returns 

3558 ------- 

3559 StataWriterUTF8 

3560 The instance has a write_file method, which will write the file to the 

3561 given `fname`. 

3562 

3563 Raises 

3564 ------ 

3565 NotImplementedError 

3566 * If datetimes contain timezone information 

3567 ValueError 

3568 * Columns listed in convert_dates are neither datetime64[ns] 

3569 or datetime.datetime 

3570 * Column dtype is not representable in Stata 

3571 * Column listed in convert_dates is not in DataFrame 

3572 * Categorical label contains more than 32,000 characters 

3573 

3574 Examples 

3575 -------- 

3576 Using Unicode data and column names 

3577 

3578 >>> from pandas.io.stata import StataWriterUTF8 

3579 >>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ']) 

3580 >>> writer = StataWriterUTF8('./data_file.dta', data) 

3581 >>> writer.write_file() 

3582 

3583 Directly write a zip file 

3584 >>> compression = {"method": "zip", "archive_name": "data_file.dta"} 

3585 >>> writer = StataWriterUTF8('./data_file.zip', data, compression=compression) 

3586 >>> writer.write_file() 

3587 

3588 Or with long strings stored in strl format 

3589 

3590 >>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']], 

3591 ... columns=['strls']) 

3592 >>> writer = StataWriterUTF8('./data_file_with_long_strings.dta', data, 

3593 ... convert_strl=['strls']) 

3594 >>> writer.write_file() 

3595 """ 

3596 

3597 _encoding: Literal["utf-8"] = "utf-8" 

3598 

3599 def __init__( 

3600 self, 

3601 fname: FilePath | WriteBuffer[bytes], 

3602 data: DataFrame, 

3603 convert_dates: dict[Hashable, str] | None = None, 

3604 write_index: bool = True, 

3605 byteorder: str | None = None, 

3606 time_stamp: datetime.datetime | None = None, 

3607 data_label: str | None = None, 

3608 variable_labels: dict[Hashable, str] | None = None, 

3609 convert_strl: Sequence[Hashable] | None = None, 

3610 version: int | None = None, 

3611 compression: CompressionOptions = "infer", 

3612 storage_options: StorageOptions = None, 

3613 *, 

3614 value_labels: dict[Hashable, dict[float, str]] | None = None, 

3615 ) -> None: 

3616 if version is None: 

3617 version = 118 if data.shape[1] <= 32767 else 119 

3618 elif version not in (118, 119): 

3619 raise ValueError("version must be either 118 or 119.") 

3620 elif version == 118 and data.shape[1] > 32767: 

3621 raise ValueError( 

3622 "You must use version 119 for data sets containing more than" 

3623 "32,767 variables" 

3624 ) 

3625 

3626 super().__init__( 

3627 fname, 

3628 data, 

3629 convert_dates=convert_dates, 

3630 write_index=write_index, 

3631 byteorder=byteorder, 

3632 time_stamp=time_stamp, 

3633 data_label=data_label, 

3634 variable_labels=variable_labels, 

3635 value_labels=value_labels, 

3636 convert_strl=convert_strl, 

3637 compression=compression, 

3638 storage_options=storage_options, 

3639 ) 

3640 # Override version set in StataWriter117 init 

3641 self._dta_version = version 

3642 

3643 def _validate_variable_name(self, name: str) -> str: 

3644 """ 

3645 Validate variable names for Stata export. 

3646 

3647 Parameters 

3648 ---------- 

3649 name : str 

3650 Variable name 

3651 

3652 Returns 

3653 ------- 

3654 str 

3655 The validated name with invalid characters replaced with 

3656 underscores. 

3657 

3658 Notes 

3659 ----- 

3660 Stata 118+ support most unicode characters. The only limitation is in 

3661 the ascii range where the characters supported are a-z, A-Z, 0-9 and _. 

3662 """ 

3663 # High code points appear to be acceptable 

3664 for c in name: 

3665 if ( 

3666 ( 

3667 ord(c) < 128 

3668 and (c < "A" or c > "Z") 

3669 and (c < "a" or c > "z") 

3670 and (c < "0" or c > "9") 

3671 and c != "_" 

3672 ) 

3673 or 128 <= ord(c) < 192 

3674 or c in {"×", "÷"} 

3675 ): 

3676 name = name.replace(c, "_") 

3677 

3678 return name