mirror of
https://github.com/NixOS/nixpkgs.git
synced 2024-11-26 08:53:21 +00:00
Merge pull request #274838 from pbsds/tokenizers-linkfarm
python3Packages.tokenizers: pack test assets in linkFarm
This commit is contained in:
commit
e4a8ac0787
@ -1,5 +1,6 @@
|
||||
{ lib
|
||||
, stdenv
|
||||
, linkFarm
|
||||
, buildPythonPackage
|
||||
, cargo
|
||||
, datasets
|
||||
@ -21,41 +22,43 @@
|
||||
let
|
||||
# See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py for details
|
||||
# about URLs and file names
|
||||
# Pre-fetched RoBERTa vocabulary; postUnpack symlinks it into tests/data as
# roberta-base-vocab.json.
# NOTE(review): the same URL and hash also appear under the
# "roberta-base-vocab.json" key of `test-data` — confirm which copy is live.
robertaVocab = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
|
||||
sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
|
||||
};
|
||||
# Pre-fetched RoBERTa merges file; postUnpack symlinks it into tests/data as
# roberta-base-merges.txt.
# NOTE(review): the same URL and hash also appear under the
# "roberta-base-merges.txt" key of `test-data` — confirm which copy is live.
robertaMerges = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
|
||||
sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
|
||||
};
|
||||
# Pre-fetched ALBERT tokenizer JSON; postUnpack symlinks it into tests/data as
# albert-base-v1-tokenizer.json.
# NOTE(review): the same URL and hash also appear under the
# "albert-base-v1-tokenizer.json" key of `test-data` — confirm which copy is live.
albertVocab = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
|
||||
sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";
|
||||
};
|
||||
# Pre-fetched BERT (uncased) vocabulary; postUnpack symlinks it into tests/data
# as bert-base-uncased-vocab.txt.
# NOTE(review): the same URL and hash also appear under the
# "bert-base-uncased-vocab.txt" key of `test-data` — confirm which copy is live.
bertVocab = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
|
||||
sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
|
||||
};
|
||||
# Pre-fetched big.txt from norvig.com; postUnpack symlinks it into tests/data
# as big.txt.
# NOTE(review): the same URL and hash also appear under the "big.txt" key of
# `test-data` — confirm which copy is live.
norvigBig = fetchurl {
|
||||
url = "https://norvig.com/big.txt";
|
||||
sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
|
||||
};
|
||||
# Pre-fetched doc-pipeline tokenizer; postUnpack symlinks it into tests/data as
# bert-wiki.json (the link name differs from the URL basename, tokenizer.json).
# NOTE(review): the same URL and hash also appear under the "bert-wiki.json"
# key of `test-data` — confirm which copy is live.
docPipelineTokenizer = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
|
||||
hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
|
||||
};
|
||||
# Pre-fetched doc-quicktour tokenizer; postUnpack symlinks it into tests/data as
# tokenizer-wiki.json (the link name differs from the URL basename, tokenizer.json).
# NOTE(review): the same URL and hash also appear under the
# "tokenizer-wiki.json" key of `test-data` — confirm which copy is live.
docQuicktourTokenizer = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
|
||||
hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
|
||||
};
|
||||
# Pre-fetched OpenAI GPT vocabulary; postUnpack symlinks it into tests/data as
# openai-gpt-vocab.json.
# NOTE(review): the same URL and hash also appear under the
# "openai-gpt-vocab.json" key of `test-data` — confirm which copy is live.
openaiVocab = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
|
||||
sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
|
||||
};
|
||||
openaiMerges = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
|
||||
sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
|
||||
# All test assets gathered into one linkFarm named "tokenizers-test-data".
# Each attribute name is the file name the asset is exposed under; each value
# is the fetchurl call that supplies it. postUnpack links the contents of this
# output into $sourceRoot/tests/data so the tests do not attempt network access.
# See https://github.com/huggingface/tokenizers/blob/main/bindings/python/tests/utils.py
# for the expected URLs and file names.
test-data = linkFarm "tokenizers-test-data" {
|
||||
"roberta-base-vocab.json" = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
|
||||
sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
|
||||
};
|
||||
"roberta-base-merges.txt" = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
|
||||
sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
|
||||
};
|
||||
"albert-base-v1-tokenizer.json" = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json";
|
||||
sha256 = "1hra9pn8rczx7378z88zjclw2qsdrdwq20m56sy42s2crbas6akf";
|
||||
};
|
||||
"bert-base-uncased-vocab.txt" = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
|
||||
sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
|
||||
};
|
||||
"big.txt" = fetchurl {
|
||||
url = "https://norvig.com/big.txt";
|
||||
sha256 = "0yz80icdly7na03cfpl0nfk5h3j3cam55rj486n03wph81ynq1ps";
|
||||
};
|
||||
# Link name differs from the URL basename (tokenizer.json).
"bert-wiki.json" = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json";
|
||||
hash = "sha256-i533xC8J5CDMNxBjo+p6avIM8UOcui8RmGAmK0GmfBc=";
|
||||
};
|
||||
# Link name differs from the URL basename (tokenizer.json).
"tokenizer-wiki.json" = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json";
|
||||
hash = "sha256-ipY9d5DR5nxoO6kj7rItueZ9AO5wq9+Nzr6GuEIfIBI=";
|
||||
};
|
||||
"openai-gpt-vocab.json" = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
|
||||
sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
|
||||
};
|
||||
"openai-gpt-merges.txt" = fetchurl {
|
||||
url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
|
||||
sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
|
||||
};
|
||||
};
|
||||
in
|
||||
buildPythonPackage rec {
|
||||
@ -107,16 +110,7 @@ buildPythonPackage rec {
|
||||
# Shell hook that creates $sourceRoot/tests/data and fills it with symlinks to
# the pre-fetched test assets, so the test suite finds them locally.
# NOTE(review): this view contains both a subshell of per-file `ln -s` commands
# (one per individual fetchurl binding) and a single glob link from
# ${test-data}; these look like the before/after sides of the same change.
# Running both would try to create each link twice — confirm only one is kept.
postUnpack = ''
|
||||
# Add data files for tests, otherwise tests attempt network access
|
||||
mkdir $sourceRoot/tests/data
|
||||
( cd $sourceRoot/tests/data
|
||||
ln -s ${robertaVocab} roberta-base-vocab.json
|
||||
ln -s ${robertaMerges} roberta-base-merges.txt
|
||||
ln -s ${albertVocab} albert-base-v1-tokenizer.json
|
||||
ln -s ${bertVocab} bert-base-uncased-vocab.txt
|
||||
ln -s ${docPipelineTokenizer} bert-wiki.json
|
||||
ln -s ${docQuicktourTokenizer} tokenizer-wiki.json
|
||||
ln -s ${norvigBig} big.txt
|
||||
ln -s ${openaiVocab} openai-gpt-vocab.json
|
||||
ln -s ${openaiMerges} openai-gpt-merges.txt )
|
||||
ln -s ${test-data}/* $sourceRoot/tests/data/
|
||||
'';
|
||||
|
||||
preCheck = ''
|
||||
|
Loading…
Reference in New Issue
Block a user