Python's mailbox.mbox is not good at opening compressed mailboxes:
>>> import mailbox
>>> print(len(mailbox.mbox("/tmp/test.mbox")))
9
>>> print(len(mailbox.mbox("/tmp/test.mbox.gz")))
0
>>> print(len(mailbox.mbox("/tmp/test1.mbox.xz")))
0
For a prototype rewrite of the MIA team's Echelon (the engine behind mia-query), I needed to scan compressed mailboxes, and I had to work around this limitation.
Here is the alternative mailbox.mbox implementation:
import bz2
import gzip
import lzma
import mailbox
from pathlib import Path
from typing import BinaryIO, Generator


class StreamMbox(mailbox.mbox):
    """
    Read-only mbox backed by an already-open binary stream.

    mailbox.mbox does not support opening a stream, which is sad. This is a
    subclass that works around it by taking the file object directly instead
    of a path, so the caller can hand in a decompressing stream.
    """

    def __init__(self, fd: BinaryIO, factory=None, create: bool = True):
        """
        Wrap ``fd`` without opening anything on disk.

        :param fd: binary stream holding mbox data. NOTE(review): the stdlib
            parser presumably needs ``seek``/``tell``/``readline`` on it --
            confirm against the Python version in use.
        :param factory: stored as the message factory, like the ``factory``
            argument of ``mailbox.mbox``.
        :param create: accepted only to mirror the ``mailbox.mbox``
            signature; it is not used.
        """
        # Do not call parent __init__, just redo everything here to be able to
        # open a stream. This will need to be re-reviewed for every new version
        # of python's stdlib.

        # Mailbox constructor
        self._path = None
        self._factory = factory

        # _singlefileMailbox constructor
        self._file = fd
        self._toc = None
        self._next_key = 0
        self._pending = False  # No changes require rewriting the file.
        self._pending_sync = False  # No need to sync the file
        self._locked = False
        self._file_length = None  # Used to record mailbox size

        # mbox constructor
        self._message_factory = mailbox.mboxMessage

    def flush(self):
        """
        Refuse to write: there is no path to rewrite the mailbox to.

        NOTE(review): the stdlib ``close()`` appears to call ``flush()``, so
        closing this object would raise as well -- close the underlying
        stream instead, and confirm against the stdlib version in use.

        :raises NotImplementedError: always.
        """
        raise NotImplementedError("StreamMbox is a readonly class")


class UsageExample:
    """Example of scanning plain or compressed mbox files with StreamMbox."""

    # Maps a file suffix to the open() function that decompresses it; any
    # other suffix is read with the builtin open().
    DECOMPRESS = {
        ".xz": lzma.open,
        ".gz": gzip.open,
        ".bz2": bz2.open,
    }

    @classmethod
    def scan(cls, path: Path) -> Generator["ScannedEmail", None, None]:
        """
        Open ``path``, decompressing by suffix if needed, and scan it.

        The file stays open for as long as the generator is being consumed.
        ``ScannedEmail`` is quoted because it is not defined in this snippet.
        """
        opener = cls.DECOMPRESS.get(path.suffix, open)
        with opener(path.as_posix(), "rb") as fd:
            yield from cls.scan_fd(path, fd)

    @classmethod
    def scan_fd(
        cls, path: Path, fd: BinaryIO
    ) -> Generator["ScannedEmail", None, None]:
        """
        Scan the messages of the mbox readable from ``fd``.

        ``path`` is not used by the code shown here.
        """
        mbox = StreamMbox(fd)
        for msg in mbox:
            # Placeholder kept from the original example: the per-message
            # processing is elided. NOTE(review): judging by the return
            # annotation this is presumably where a ScannedEmail gets built
            # and yielded; until then this function yields nothing.
            ...