diff --git a/README.md b/README.md index 7efd96d..0351daf 100644 --- a/README.md +++ b/README.md @@ -4,10 +4,19 @@ A toolbox to benchmark workloads in [TPC](http://www.tpc.org/) ## Install +### Download binary + +You can download the pre-built binary [here](https://github.com/pingcap/go-tpc/releases) + +### Build from source + ```bash -make +git clone https://github.com/pingcap/go-tpc.git +make build ``` +Then you can find the `go-tpc` binary file in the `/bin` directory. + ## Usage By default, go-tpc uses `root::@tcp(127.0.0.1:4000)/test` as the default dsn address, you can override it by setting below flags: @@ -47,6 +56,8 @@ For example: ./bin/go-tpc tpcc --warehouses 4 prepare --output data --pprof :10111 ``` +If you want to import tpcc data into TiDB, please refer to [import-to-tidb](docs/import-to-tidb.md). + ### TPC-H ```bash diff --git a/docs/import-to-tidb.md b/docs/import-to-tidb.md new file mode 100644 index 0000000..7cd1db8 --- /dev/null +++ b/docs/import-to-tidb.md @@ -0,0 +1,117 @@ +# How to import tpcc data to TiDB + +Currently if you want to import tpcc dataset into the database via `go-tpc`, you have two ways: + +1. Using the `go-tpc prepare` to load data to DB directly, this is valid for both `MySQL` and `TiDB`. +2. Output the data into CSV files, then import the CSV files to `TiDB` with `lightning` (TiDB only) + +This document will explain how to use the above ways to import data into TiDB. For simplicity, we will +start a small `TiDB` cluster using `tiup playground`. + +``` bash +➜ ~ tiup playground +Starting /Users/yeya24/.tiup/components/playground/v0.0.9/playground +Playground Bootstrapping... 
+Starting /Users/yeya24/.tiup/components/pd/v4.0.0-rc.1/pd-server --name=pd-0 --data-dir=/Users/yeya24/.tiup/data/RyBAqGx/pd-0/data --peer-urls=http://127.0.0.1:2380 --advertise-peer-urls=http://127.0.0.1:2380 --client-urls=http://127.0.0.1:2379 --advertise-client-urls=http://127.0.0.1:2379 --log-file=/Users/yeya24/.tiup/data/RyBAqGx/pd-0/pd.log --initial-cluster=pd-0=http://127.0.0.1:2380 +Starting /Users/yeya24/.tiup/components/tidb/v4.0.0-rc.1/tidb-server -P 4000 --store=tikv --host=127.0.0.1 --status=10080 --path=127.0.0.1:2379 --log-file=/Users/yeya24/.tiup/data/RyBAqGx/tidb-0/tidb.log +Starting /Users/yeya24/.tiup/components/tikv/v4.0.0-rc.1/tikv-server --addr=127.0.0.1:20160 --status-addr=127.0.0.1:20180 --pd=http://127.0.0.1:2379 --config=/Users/yeya24/.tiup/data/RyBAqGx/tikv-0/tikv.toml --data-dir=/Users/yeya24/.tiup/data/RyBAqGx/tikv-0/data --log-file=/Users/yeya24/.tiup/data/RyBAqGx/tikv-0/tikv.log +..... +CLUSTER START SUCCESSFULLY, Enjoy it ^-^ +To connect TiDB: mysql --host 127.0.0.1 --port 4000 -u root +To view the dashboard: http://127.0.0.1:2379/dashboard +``` + +## Load data directly + +This way is very easy and straightforward. However, it will be slower than the second way because +it just executes the SQL `Insert` statement at the database. + +``` bash +go-tpc tpcc prepare --warehouses 100 -D test -H 127.0.0.1 -P 4000 -T 10 +``` + +Here the `warehouses` flag means the data size, the `-T` flag means how many threads we want to use for loading data. + +## Using CSV and lightning + +This method includes two steps: + +1. Export CSV files +2. Import CSV files to TiDB using lightning + +### Export CSV files + +``` bash +go-tpc tpcc prepare --warehouses 100 -D test -H 127.0.0.1 -P 4000 -T 16 --output-type csv --output-dir csv/ +``` + +In order to export CSV files, we need to specify two flags here. `--output-type` is what type of file we +want to export, and `--output-dir` is the directory of the exported CSV files. 
+ +``` bash +ls csv/ +test.customer.0.csv test.district.14.csv test.history.6.csv test.order_line.1.csv test.orders.15.csv test.stock.7.csv +test.customer.1.csv test.district.15.csv test.history.7.csv test.order_line.10.csv test.orders.2.csv test.stock.8.csv +test.customer.10.csv test.district.2.csv test.history.8.csv test.order_line.11.csv test.orders.3.csv test.stock.9.csv +test.customer.11.csv test.district.3.csv test.history.9.csv test.order_line.12.csv test.orders.4.csv test.warehouse.0.csv +test.customer.12.csv test.district.4.csv test.item.0.csv test.order_line.13.csv test.orders.5.csv test.warehouse.1.csv +test.customer.13.csv test.district.5.csv test.new_order.0.csv test.order_line.14.csv test.orders.6.csv test.warehouse.10.csv +test.customer.14.csv test.district.6.csv test.new_order.1.csv test.order_line.15.csv test.orders.7.csv test.warehouse.11.csv +test.customer.15.csv test.district.7.csv test.new_order.10.csv test.order_line.2.csv test.orders.8.csv test.warehouse.12.csv +test.customer.2.csv test.district.8.csv test.new_order.11.csv test.order_line.3.csv test.orders.9.csv test.warehouse.13.csv +test.customer.3.csv test.district.9.csv test.new_order.12.csv test.order_line.4.csv test.stock.0.csv test.warehouse.14.csv +test.customer.4.csv test.history.0.csv test.new_order.13.csv test.order_line.5.csv test.stock.1.csv test.warehouse.15.csv +test.customer.5.csv test.history.1.csv test.new_order.14.csv test.order_line.6.csv test.stock.10.csv test.warehouse.2.csv +test.customer.6.csv test.history.10.csv test.new_order.15.csv test.order_line.7.csv test.stock.11.csv test.warehouse.3.csv +test.customer.7.csv test.history.11.csv test.new_order.2.csv test.order_line.8.csv test.stock.12.csv test.warehouse.4.csv +test.customer.8.csv test.history.12.csv test.new_order.3.csv test.order_line.9.csv test.stock.13.csv test.warehouse.5.csv +test.customer.9.csv test.history.13.csv test.new_order.4.csv test.orders.0.csv test.stock.14.csv test.warehouse.6.csv 
+test.district.0.csv test.history.14.csv test.new_order.5.csv test.orders.1.csv test.stock.15.csv test.warehouse.7.csv
+test.district.1.csv test.history.15.csv test.new_order.6.csv test.orders.10.csv test.stock.2.csv test.warehouse.8.csv
+test.district.10.csv test.history.2.csv test.new_order.7.csv test.orders.11.csv test.stock.3.csv test.warehouse.9.csv
+test.district.11.csv test.history.3.csv test.new_order.8.csv test.orders.12.csv test.stock.4.csv
+test.district.12.csv test.history.4.csv test.new_order.9.csv test.orders.13.csv test.stock.5.csv
+test.district.13.csv test.history.5.csv test.order_line.0.csv test.orders.14.csv test.stock.6.csv
+```
+
+After exporting the files, we can check them in the directory. Here all CSV files conform to the naming scheme `<database>.<table>.<thread>.csv`.
+
+Please note that no matter how many threads you are using, there is only `test.item.0.csv` for the item table since we only
+use one thread to create that.
+
+### Import data using lightning
+
+Since `Tiup` doesn't support `lightning` so far, we have to download the binary somewhere or build it from source.
+For simplicity, this document will not include that part, please refer to [lightning doc](https://pingcap.com/docs/stable/reference/tools/tidb-lightning/overview/) for more details.
+
+With the `lightning` binary, it is easy to import data. We also provide an example [config](./tidb-lightning.toml) for `lightning`. You can just execute the command below:
+
+```bash
+lightning -c tidb-lightning.toml
+```
+
+Please note that:
+1. This example config uses `tidb` as the `lightning` backend instead of `importer`. Please update related configs if you are using `importer`.
+2. Please change the `data-source-dir` field to the CSV directory you set in the previous step.
+3. Please update the `tidb` section if you have a different setup.
+
+For the status of the import process, please check `tidb-lightning.log`. If you see the logs below, then it is perfect!
+ +```bash +[2020/05/03 12:51:25.004 -04:00] [INFO] [backend.go:265] ["engine close start"] [engineTag=`test`.`stock`:-1] [engineUUID=5565f8ab-07bc-5dfb-a64c-717945dd3a64] +[2020/05/03 12:51:25.004 -04:00] [INFO] [backend.go:267] ["engine close completed"] [engineTag=`test`.`stock`:-1] [engineUUID=5565f8ab-07bc-5dfb-a64c-717945dd3a64] [takeTime=210ns] [] +[2020/05/03 12:51:25.004 -04:00] [INFO] [restore.go:1422] ["import and cleanup engine start"] [engineTag=`test`.`stock`:-1] [engineUUID=5565f8ab-07bc-5dfb-a64c-717945dd3a64] +[2020/05/03 12:51:25.004 -04:00] [INFO] [backend.go:279] ["import start"] [engineTag=`test`.`stock`:-1] [engineUUID=5565f8ab-07bc-5dfb-a64c-717945dd3a64] [retryCnt=0] +[2020/05/03 12:51:25.004 -04:00] [INFO] [backend.go:282] ["import completed"] [engineTag=`test`.`stock`:-1] [engineUUID=5565f8ab-07bc-5dfb-a64c-717945dd3a64] [retryCnt=0] [takeTime=304ns] [] +[2020/05/03 12:51:25.004 -04:00] [INFO] [backend.go:294] ["cleanup start"] [engineTag=`test`.`stock`:-1] [engineUUID=5565f8ab-07bc-5dfb-a64c-717945dd3a64] +[2020/05/03 12:51:25.004 -04:00] [INFO] [backend.go:296] ["cleanup completed"] [engineTag=`test`.`stock`:-1] [engineUUID=5565f8ab-07bc-5dfb-a64c-717945dd3a64] [takeTime=189ns] [] +[2020/05/03 12:51:25.004 -04:00] [INFO] [restore.go:1429] ["import and cleanup engine completed"] [engineTag=`test`.`stock`:-1] [engineUUID=5565f8ab-07bc-5dfb-a64c-717945dd3a64] [takeTime=54.46µs] [] +[2020/05/03 12:51:25.004 -04:00] [INFO] [restore.go:602] ["restore table completed"] [table=`test`.`stock`] [takeTime=29.720372962s] [] +[2020/05/03 12:51:25.004 -04:00] [INFO] [restore.go:697] ["restore all tables data completed"] [takeTime=38.919570374s] [] +[2020/05/03 12:51:25.004 -04:00] [INFO] [restore.go:475] ["everything imported, stopping periodic actions"] +[2020/05/03 12:51:25.004 -04:00] [INFO] [restore.go:1072] ["skip full compaction"] +[2020/05/03 12:51:25.014 -04:00] [INFO] [restore.go:1241] ["clean checkpoints start"] [keepAfterSuccess=false] 
[taskID=1588524646072446000] +[2020/05/03 12:51:25.014 -04:00] [INFO] [restore.go:1248] ["clean checkpoints completed"] [keepAfterSuccess=false] [taskID=1588524646072446000] [takeTime=152.037µs] [] +[2020/05/03 12:51:25.014 -04:00] [INFO] [restore.go:283] ["the whole procedure completed"] [takeTime=38.936017956s] [] +[2020/05/03 12:51:25.014 -04:00] [INFO] [main.go:77] ["tidb lightning exit"] +``` diff --git a/docs/tidb-lightning.toml b/docs/tidb-lightning.toml new file mode 100644 index 0000000..973f406 --- /dev/null +++ b/docs/tidb-lightning.toml @@ -0,0 +1,156 @@ +### tidb-lightning configuration +[lightning] + +# Listening address for the HTTP server (set to empty string to disable). +# The server is responsible for the web interface, submitting import tasks, +# serving Prometheus metrics and exposing debug profiling data. +status-addr = "" + +# Toggle server mode. +# If "false", running Lightning will immediately start the import job, and exits +# after the job is finished. +# If "true", running Lightning will wait for user to submit tasks, via the HTTP API +# (`curl http://lightning-ip:8289/tasks --data-binary @tidb-lightning.toml`). +# The program will keep running and waiting for more tasks, until receiving the SIGINT signal. +server-mode = false + +# check if the cluster satisfies the minimum requirement before starting +# check-requirements = true + +# index-concurrency controls the maximum handled index concurrently while reading Mydumper SQL files. It can affect the tikv-importer disk usage. +index-concurrency = 2 +# table-concurrency controls the maximum handled tables concurrently while reading Mydumper SQL files. It can affect the tikv-importer memory usage. +table-concurrency = 6 + +# logging +level = "info" +file = "tidb-lightning.log" +max-size = 128 # MB +max-days = 28 +max-backups = 14 + + +[checkpoint] +# Whether to enable checkpoints. 
+# While importing, Lightning will record which tables have been imported, so even if Lightning or another component
+# crashes, we can start from a known good state instead of redoing everything.
+enable = true
+# The schema name (database name) to store the checkpoints
+schema = "tidb_lightning_checkpoint"
+# Where to store the checkpoints.
+# Set to "file" to store as a local file.
+# Set to "mysql" to store into a remote MySQL-compatible database
+driver = "file"
+# The data source name (DSN) indicating the location of the checkpoint storage.
+# For "file" driver, the DSN is a path. If not specified, Lightning would default to "/tmp/CHKPTSCHEMA.pb".
+# For "mysql" driver, the DSN is a URL in the form "USER:PASS@tcp(HOST:PORT)/".
+# If not specified, the TiDB server from the [tidb] section will be used to store the checkpoints.
+#dsn = "/tmp/tidb_lightning_checkpoint.pb"
+# Whether to keep the checkpoints after all data are imported. If false, the checkpoints will be deleted. The schema
+# needs to be dropped manually, however.
+#keep-after-success = false
+
+
+[tikv-importer]
+# Delivery backend, can be "importer" or "tidb".
+backend = "tidb"
+
+# What to do on duplicated record (unique key conflict) when the backend is 'tidb'. Possible values are:
+# - replace: replace the old record by the new record (i.e. insert rows using "REPLACE INTO")
+# - ignore: keep the old record and ignore the new record (i.e. insert rows using "INSERT IGNORE INTO")
+# - error: stop Lightning and report an error (i.e. insert rows using "INSERT INTO")
+#on-duplicate = "replace"
+
+[mydumper]
+# block size of file reading
+read-block-size = 65536 # Byte (default = 64 KB)
+# minimum size (in terms of source data file) of each batch of import.
+# Lightning will split a large table into multiple engine files according to this size.
+batch-size = 107_374_182_400 # Byte (default = 100 GiB)
+
+# Engine file needs to be imported sequentially. 
Due to table-concurrency, multiple engines will be
+# imported at nearly the same time; this creates a queue and wastes resources. Therefore,
+# Lightning will slightly increase the size of the first few batches to properly distribute
+# resources. The scale-up is controlled by this parameter, which expresses the ratio of duration
+# between the "import" and "write" steps with full concurrency. This can be calculated as the ratio
+# (import duration / write duration) of a single table of size around 1 GB. The exact timing can be
+# found in the log. If "import" is faster, the batch size anomaly is smaller, and a ratio of
+# zero means uniform batch size. This value should be in the range (0 <= batch-import-ratio < 1).
+batch-import-ratio = 0.75
+
+# mydumper local source data directory; please change this to the directory containing your CSV files
+data-source-dir = "/data"
+# if no-schema is set true, lightning will get schema information directly from tidb-server instead of creating the schemas itself.
+no-schema = true
+# the character set of the schema files; only supports one of:
+# - utf8mb4: the schema files must be encoded as UTF-8, otherwise will emit errors
+# - gb18030: the schema files must be encoded as GB-18030, otherwise will emit errors
+# - auto: (default) automatically detect if the schema is UTF-8 or GB-18030, error if the encoding is neither
+# - binary: do not try to decode the schema files
+# note that the *data* files are always parsed as binary regardless of schema encoding.
+#character-set = "auto"
+
+# make table and database names case-sensitive, i.e. treats `DB`.`TBL` and `db`.`tbl` as two
+# different objects. Currently only affects [[routes]].
+case-sensitive = false
+
+# CSV files are imported according to MySQL's LOAD DATA INFILE rules.
+[mydumper.csv]
+# separator between fields, should be an ASCII character.
+separator = ','
+# string delimiter, can either be an ASCII character or empty string. 
+delimiter = ""
+# whether the CSV files contain a header. If true, the first line will be skipped
+header = false
+# whether the CSV contains any NULL value. If true, all columns from CSV cannot be NULL.
+not-null = false
+# if not-null = false (i.e. CSV can contain NULL), fields equal to this value will be treated as NULL
+null = "NULL"
+# whether to interpret backslash-escape inside strings.
+backslash-escape = false
+# if a line ends with a separator, remove it.
+trim-last-separator = false
+
+# configuration for tidb server address (one is enough) and pd server address (one is enough).
+[tidb]
+host = "127.0.0.1"
+port = 4000
+user = "root"
+password = ""
+# table schema information is fetched from tidb via this status-port.
+status-port = 10080
+pd-addr = "127.0.0.1:2379"
+# lightning uses some code of tidb (used as a library), and this flag controls its log level.
+log-level = "error"
+
+# set tidb session variables to speed up checksum/analyze table.
+# see https://pingcap.com/docs/sql/statistics/#control-analyze-concurrency for the meaning of each setting
+build-stats-concurrency = 20
+distsql-scan-concurrency = 100
+index-serial-scan-concurrency = 20
+checksum-table-concurrency = 16
+
+
+# post-restore provides some options which will be executed after all kv data has been imported into the tikv cluster.
+# the execution order is (if set true): checksum -> analyze
+[post-restore]
+# if set true, checksum will do ADMIN CHECKSUM TABLE <schema>.<table> for each table.
+checksum = true
+# if set to true, compact will do level 1 compaction to tikv data.
+# if this setting is missing, the default value is false.
+level-1-compact = false
+# if set true, compact will do full compaction to tikv data.
+# if this setting is missing, the default value is false.
+compact = false
+# if set true, analyze will do ANALYZE TABLE <schema>.<table> for each table.
+analyze = true
+
+# cron performs some periodic actions in the background
+[cron]
+# duration between which Lightning will automatically refresh the import mode status.
+# should be shorter than the corresponding TiKV setting
+switch-mode = "5m"
+# the interval at which the import progress will be printed to the log.
+log-progress = "5m"
+