from typing import Iterable, Dict
import gzip
import json
import os


ROOT = os.path.dirname(os.path.abspath(__file__))
HUMAN_EVAL = os.path.join(ROOT, "..", "data", "HumanEval.jsonl.gz")


def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]:
    """
    Loads every record of a jsonl file into a dictionary keyed by "task_id".

    If several records share a "task_id", the last one read wins.
    """
    return {task["task_id"]: task for task in stream_jsonl(evalset_file)}


def _parse_lines(lines: Iterable[str]) -> Iterable[Dict]:
    """
    Yields the decoded JSON value of every line that is not blank.
    """
    for line in lines:
        # Empty and whitespace-only lines carry no record; skip them.
        if line.strip():
            yield json.loads(line)


def stream_jsonl(filename: str) -> Iterable[Dict]:
    """
    Parses each jsonl line and yields it as a dictionary.

    Files whose name ends with ".gz" are decompressed on the fly. Both the
    plain and the gzipped variant are decoded as UTF-8, which is the encoding
    write_jsonl produces. Blank lines are skipped.
    """
    if filename.endswith(".gz"):
        # Pass the encoding explicitly: without it text mode falls back to the
        # locale's preferred encoding and can mis-decode non-ASCII content.
        with gzip.open(filename, "rt", encoding="utf-8") as fp:
            yield from _parse_lines(fp)
    else:
        with open(filename, "r", encoding="utf-8") as fp:
            yield from _parse_lines(fp)


def _write_records(fp, data: Iterable[Dict]) -> None:
    """
    Writes each item of data to the binary stream fp as one UTF-8 JSON line.
    """
    for x in data:
        fp.write((json.dumps(x) + "\n").encode("utf-8"))


def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False) -> None:
    """
    Writes an iterable of dictionaries to jsonl.

    A leading "~" in filename is expanded to the user's home directory. When
    the name ends with ".gz" the output is gzip-compressed. With append=True
    the records are added to the end of an existing file instead of
    replacing it.
    """
    mode = "ab" if append else "wb"
    filename = os.path.expanduser(filename)
    with open(filename, mode) as fp:
        if filename.endswith(".gz"):
            # The gzip layer is always opened in 'wb': when appending, the
            # underlying file is positioned at its end, so this adds a new
            # gzip member after the existing data rather than rewriting it.
            with gzip.GzipFile(fileobj=fp, mode="wb") as gzfp:
                _write_records(gzfp, data)
        else:
            _write_records(fp, data)