"""the_pile dataset"""

import tensorflow_datasets as tfds
import tensorflow as tf
import io
import zstandard
import jsonlines
import os

class JsonReader:
    """Iterate over the records of one or more JSON-lines files.

    Every record is yielded as a dict with a single ``'targets'`` key holding
    the record's ``'text'`` field.
    """

    def __init__(self, filenames, para_joiner='\n\n'):
        """Remember which files to read.

        Args:
            filenames: A single filename, or a list of filenames.
            para_joiner: Accepted for interface compatibility; this class
                does not use it.
        """
        if not isinstance(filenames, list):
            filenames = [filenames]
        self.filenames = filenames

    def _read_fn(self, filename):
        """Yield one ``{'targets': text}`` dict per record in ``filename``.

        Raises:
            KeyError: If a record has no ``'text'`` field.
        """
        # The file is only read, so open it read-only instead of 'rb+'.
        with tf.io.gfile.GFile(filename, 'rb') as f:
            reader = jsonlines.Reader(f)
            for item in reader:
                # Only the text is forwarded; the record's 'meta' field is
                # intentionally not emitted.
                yield {'targets': item['text']}

    def __iter__(self):
        """Yield the records of every file, in the order the files were given.

        An empty file list produces an empty iteration.
        """
        # `yield from` (rather than `return`) so that iteration continues
        # into the second and later files instead of stopping after the first.
        for filename in self.filenames:
            yield from self._read_fn(filename)

class TheJsonConfig(tfds.core.BuilderConfig):
    """Builder configuration whose name is taken from ``mode``."""

    def __init__(self, *, mode=None, **kwargs):
        """Create the configuration.

        Args:
            mode: Used as the configuration's ``name``.
            **kwargs: Forwarded unchanged to ``tfds.core.BuilderConfig``.
        """
        summary = "The json dataset"
        super().__init__(name=mode, description=summary, **kwargs)

    
class TheJsonData(tfds.core.GeneratorBasedBuilder):
    """TFDS builder that exposes JSON-lines records as ``text``/``meta`` examples.

    NOTE(review): no ``_split_generators`` is visible in this part of the
    file. TFDS needs one to call ``_generate_examples(paths=...)`` — confirm
    it is defined, or add it.
    """

    BUILDER_CONFIGS = [
        TheJsonConfig(version=tfds.core.Version('0.0.0'), mode=mode) for mode in ["lm"]
    ]

    def _info(self) -> tfds.core.DatasetInfo:
        """Return the dataset metadata: two text features, ``text`` and ``meta``."""
        return tfds.core.DatasetInfo(
            builder=self,
            description="json data",
            features=tfds.features.FeaturesDict({
                'text': tfds.features.Text(),
                'meta': tfds.features.Text()
            }),
        )

    def _generate_examples(self, paths=None):
        """Yield ``(key, example)`` pairs read from the files matching ``paths``.

        Args:
            paths: A glob pattern (or list of patterns) passed to
                ``tf.io.gfile.glob`` to locate the JSON-lines files.

        Yields:
            Tuples of a string index and a dict with ``'text'`` and ``'meta'``.

        Raises:
            ValueError: If ``paths`` is not provided.
            KeyError: If a record has neither a ``'text'`` nor a ``'targets'``
                key.
        """
        # `paths` used to be an undefined name here; it is now an explicit
        # argument so the failure mode is a clear error, not a NameError.
        if paths is None:
            raise ValueError(
                "_generate_examples requires `paths`: a glob pattern (or list "
                "of patterns) locating the JSON-lines files.")
        pipeline = JsonReader(tf.io.gfile.glob(paths))
        for x, result in enumerate(pipeline):
            if not result:
                continue
            # JsonReader stores the record text under 'targets'; accept
            # 'text' as well so either record shape works.
            text = result['text'] if 'text' in result else result['targets']
            # 'meta' is not emitted by JsonReader, so fall back to an empty
            # string to satisfy the declared Text feature.
            meta = result.get('meta', '')
            yield f'{x}', {'text': text, 'meta': meta}