From 29df0bf02bc408cef3e34ff9fb20e686b0f5ff7e Mon Sep 17 00:00:00 2001 From: ds <63077097+dsmedia@users.noreply.github.com> Date: Sun, 8 Dec 2024 13:29:28 -0500 Subject: [PATCH 01/40] feat: Add SOURCES.toml for dataset metadata - Add SOURCES.toml to provide supplemental (extrinisic) metadata on datasets, from SOURCES.md, in a form usable by build_datapackage.py - Include resource descriptions, sources, and licenses to supplement script output - Preserve existing markdown content for future documentation - TODO: Remove duplicated content between descriptions and sources - TODO: Incorporate resource-level column descriptions into table schema, where available - TODO: determine if root-level $schema property should be specified in the TOML file with the value "https://datapackage.org/profiles/2.0/datapackage.json" per Frictionless Data guidelines Resolves #634 --- SOURCES.toml | 630 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 630 insertions(+) create mode 100644 SOURCES.toml diff --git a/SOURCES.toml b/SOURCES.toml new file mode 100644 index 00000000..c2585cf8 --- /dev/null +++ b/SOURCES.toml @@ -0,0 +1,630 @@ +[[resources]] # Path: 7zip.png +path = "7zip.png" +description = """Application icons from open-source software projects.""" + +[[resources]] # Path: airports.csv +path = "airports.csv" + +[[resources]] # Path: annual-precip.json +path = "annual-precip.json" +description = """A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell, from [CFSv2](https://www.ncdc.noaa.gov/data-access/model-data/model-datasets/climate-forecast-system-version2-cfsv2).""" + +[[resources.sources]] +title = "Climate Forecast System Version 2" +path = "https://www.ncdc.noaa.gov/data-access/model-data/model-datasets/climate-forecast-system-version2-cfsv2" + +[[resources]] # Path: anscombe.json +path = "anscombe.json" +description = """Graphs in Statistical Analysis, F. J. 
Anscombe, The American Statistician.""" + +[[resources]] # Path: barley.json +path = "barley.json" +description = """The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites. It was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper \"Statistical Determination of Barley Varietal Adaption.\" R.A. Fisher popularized its use in the field of statistics when he included it in his book [\"The Design of Experiments.\"](https://en.wikipedia.org/wiki/The_Design_of_Experiments) Since then it has been used to demonstrate new statistical techniques, including the [trellis charts](http://ml.stat.purdue.edu/stat695t/writings/TrellisDesignControl.pdf) developed by Richard Becker, William Cleveland and others in the 1990s.""" + +[[resources.sources]] +title = "The Design of Experiments Reference" +path = "https://en.wikipedia.org/wiki/The_Design_of_Experiments" +[[resources.sources]] +title = "Trellis Charts Paper" +path = "http://ml.stat.purdue.edu/stat695t/writings/TrellisDesignControl.pdf" + +[[resources]] # Path: birdstrikes.csv +path = "birdstrikes.csv" +description = """http://wildlife.faa.gov""" + +[[resources.sources]] +title = "FAA Wildlife Strike Database" +path = "http://wildlife.faa.gov" + +[[resources]] # Path: budget.json +path = "budget.json" +description = """Source: Office of Management and Budget (U.S.) 
+[Budget FY 2016 - Receipts](https://www.govinfo.gov/app/details/BUDGET-2016-DB/BUDGET-2016-DB-3)""" + +[[resources.sources]] +title = "Office of Management and Budget - Budget FY 2016" +path = "https://www.govinfo.gov/app/details/BUDGET-2016-DB/BUDGET-2016-DB-3" + +[[resources]] # Path: budgets.json +path = "budgets.json" + +[[resources]] # Path: burtin.json +path = "burtin.json" +description = """The burtin.json dataset is based on graphic designer [Will Burtin's](https://en.wikipedia.org/wiki/Will_Burtin) 1951 visualization of antibiotic effectiveness, originally published in [Scope Magazine](https://graphicdesignarchives.org/projects/scope-magazine-vol-iii-5/). The dataset compares the performance of three antibiotics against 16 different bacteria. The numerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness. The dataset was featured as an [example](https://mbostock.github.io/protovis/ex/antibiotics-burtin.html) in the Protovis project, a precursor to D3.js. The Protovis example notes that, \"Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 μg/ml, and an exaggeration of some values for penicillin.\" The vega-datasets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together. +The caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) reads as follows: +> ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin + +> +> The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in red and gram- in blue) with their sensitivities to penicillin, and streptomycin. 
The effectiveness of the antibiotics is expressed as the highest dilution in μ/ml. which inhibits the test organism. High dilutions are toward the periphery; consequently the length of the colored bar is proportional to the effectiveness. It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. vulgaris, S. schottmuelleri and M. tuberculosis. Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to neomycin, although the majority of these are sensitive to neomycin. It also inhibits actinomycetes, but is inactive against viruses and fungi. Its mode of action is not understood.""" +[[resources.sources]] +title = "Scope Magazine" +path = "https://graphicdesignarchives.org/projects/scope-magazine-vol-iii-5/" +[[resources.sources]] +title = "Protovis Antibiotics Example" +path = "https://mbostock.github.io/protovis/ex/antibiotics-burtin.html" + +[[resources]] # Path: cars.json +path = "cars.json" +description = """http://lib.stat.cmu.edu/datasets/""" + +[[resources.sources]] +title = "StatLib Datasets Archive" +path = "http://lib.stat.cmu.edu/datasets/" + +[[resources]] # Path: co2-concentration.csv +path = "co2-concentration.csv" +description = """https://scrippsco2.ucsd.edu/data/atmospheric_co2/primary_mlo_co2_record but modified to only include date, CO2, seasonally adjusted CO2 and only include rows with valid data.""" + +[[resources.sources]] +title = "Scripps CO2 Program" +path = "https://scrippsco2.ucsd.edu/data/atmospheric_co2/primary_mlo_co2_record" + +[[resources]] # Path: countries.json +path = "countries.json" +description = """- **Original Data**: [Gapminder Foundation](https://www.gapminder.org/) +- **URLs**: + +- Life Expectancy (v14): [Data](https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676) | 
[Reference](https://www.gapminder.org/data/documentation/gd004/) +- Fertility (v14): [Data](https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676) | [Reference](https://www.gapminder.org/data/documentation/gd008/) +- **Date Accessed**: July 31, 2024 +- **License**: Creative Commons Attribution 4.0 International (CC BY 4.0) | [Reference](https://www.gapminder.org/free-material/) +This dataset combines key demographic indicators (life expectancy at birth and fertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year intervals. It includes both current values and adjacent time period values (previous and next) for each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) notes that its philosophy is to fill data gaps with estimates and use current geographic boundaries for historical data. Gapminder states that it aims to \"show people the big picture\" rather than support detailed numeric analysis. +1. `year` (type: integer): Years from 1955 to 2000 at 5-year intervals +2. `country` (type: string): Name of the country +3. `fertility` (type: float): Fertility rate (average number of children per woman) for the given year +4. `life_expect` (type: float): Life expectancy in years for the given year +5. `p_fertility` (type: float): Fertility rate for the previous 5-year interval +6. `n_fertility` (type: float): Fertility rate for the next 5-year interval +7. `p_life_expect` (type: float): Life expectancy for the previous 5-year interval +8. 
`n_life_expect` (type: float): Life expectancy for the next 5-year interval""" +[[resources.sources]] +title = "Gapminder Foundation - Life Expectancy" +path = "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676" +version = "14" +[[resources.sources]] +title = "Gapminder Foundation - Fertility" +path = "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676" +version = "14" +[[resources.licenses]] +title = "Creative Commons Attribution 4.0 International" +path = "https://www.gapminder.org/free-material/" + +[[resources]] # Path: crimea.json +path = "crimea.json" + +[[resources]] # Path: disasters.csv +path = "disasters.csv" +description = """https://ourworldindata.org/natural-catastrophes""" + +[[resources.sources]] +title = "Our World in Data - Natural Catastrophes" +path = "https://ourworldindata.org/natural-catastrophes" + +[[resources]] # Path: driving.json +path = "driving.json" +description = """https://archive.nytimes.com/www.nytimes.com/imagepages/2010/05/02/business/02metrics.html""" + +[[resources.sources]] +title = "New York Times" +path = "https://archive.nytimes.com/www.nytimes.com/imagepages/2010/05/02/business/02metrics.html" + +[[resources]] # Path: earthquakes.json +path = "earthquakes.json" +description = """https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson +(Feb 6, 2018)""" + +[[resources.sources]] +title = "USGS Earthquake Feed" +path = "https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson" + +[[resources]] # Path: ffox.png +path = "ffox.png" +description = """Application icons from open-source software projects.""" + +[[resources]] # Path: flare-dependencies.json +path = "flare-dependencies.json" + +[[resources]] # Path: flare.json +path = "flare.json" + +[[resources]] # Path: flights-10k.json +path = "flights-10k.json" +description = """Flight delay statistics from U.S. 
Bureau of Transportation Statistics. https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr +Transformed using `/scripts/flights.py`""" + +[[resources.sources]] +title = "U.S. Bureau of Transportation Statistics" +path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" + +[[resources]] # Path: flights-200k.arrow +path = "flights-200k.arrow" +description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr +Transformed using `/scripts/flights.py`""" + +[[resources.sources]] +title = "U.S. Bureau of Transportation Statistics" +path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" + +[[resources]] # Path: flights-200k.json +path = "flights-200k.json" +description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr +Transformed using `/scripts/flights.py`""" + +[[resources.sources]] +title = "U.S. Bureau of Transportation Statistics" +path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" + +[[resources]] # Path: flights-20k.json +path = "flights-20k.json" +description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr +Transformed using `/scripts/flights.py`""" + +[[resources.sources]] +title = "U.S. Bureau of Transportation Statistics" +path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" + +[[resources]] # Path: flights-2k.json +path = "flights-2k.json" +description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. 
https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr +Transformed using `/scripts/flights.py`""" + +[[resources.sources]] +title = "U.S. Bureau of Transportation Statistics" +path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" + +[[resources]] # Path: flights-3m.parquet +path = "flights-3m.parquet" +description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr +Transformed using `/scripts/flights.py`""" + +[[resources.sources]] +title = "U.S. Bureau of Transportation Statistics" +path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" + +[[resources]] # Path: flights-5k.json +path = "flights-5k.json" +description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr +Transformed using `/scripts/flights.py`""" + +[[resources.sources]] +title = "U.S. Bureau of Transportation Statistics" +path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" + +[[resources]] # Path: flights-airport.csv +path = "flights-airport.csv" +description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr +Transformed using `/scripts/flights.py`""" + +[[resources.sources]] +title = "U.S. Bureau of Transportation Statistics" +path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" + +[[resources]] # Path: football.json +path = "football.json" +description = """Football match outcomes across multiple divisions from 2013 to 2017. This dataset is a subset of a larger dataset from https://github.com/openfootball/football.json. 
The subset was made such that there are records for all five chosen divisions over the time period.""" + +[[resources.sources]] +title = "OpenFootball" +path = "https://github.com/openfootball/football.json" + +[[resources]] # Path: gapminder-health-income.csv +path = "gapminder-health-income.csv" +description = """**Original Data**: [Gapminder Foundation](https://www.gapminder.org/) +**Description** Per-capita income, life expectancy, population and regional grouping. Dataset does not specify the reference year for the data. Gapminder historical data is subject to revisions. + +Gapminder (v30, 2023) defines per-capita income as follows: +>\"This is real GDP per capita (gross domestic product per person adjusted for inflation) converted to international dollars using purchasing power parity rates. An international dollar has the same purchasing power over GDP as the U.S. dollar has in the United States.\" | [Source](https://docs.google.com/spreadsheets/d/1i5AEui3WZNZqh7MQ4AKkJuCz4rRxGR_pw_9gtbcBOqQ/edit?gid=501532268#gid=501532268) +**License**: Creative Commons Attribution 4.0 International (CC BY 4.0) | [Reference](https://www.gapminder.org/free-material/)""" +[[resources.sources]] +title = "Gapminder Foundation" +path = "https://www.gapminder.org" +[[resources.sources]] +title = "Gapminder GDP Per Capita Data" +path = "https://docs.google.com/spreadsheets/d/1i5AEui3WZNZqh7MQ4AKkJuCz4rRxGR_pw_9gtbcBOqQ/edit?gid=501532268#gid=501532268" + +[[resources]] # Path: gapminder.json +path = "gapminder.json" +description = """- **Original Data**: [Gapminder Foundation](https://www.gapminder.org/) +- **URLs**: + +- Life Expectancy (v14): [Data](https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676) | [Reference](https://www.gapminder.org/data/documentation/gd004/) +- Population (v7): [Data](https://docs.google.com/spreadsheets/d/1c1luQNdpH90tNbMIeU7jD__59wQ0bdIGRFpbMm8ZBTk/edit?gid=176703676#gid=176703676) | 
[Reference](https://www.gapminder.org/data/documentation/gd003/) +- Fertility (v14): [Data](https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676) | [Reference](https://www.gapminder.org/data/documentation/gd008/) +- Data Geographies (v2): [Data](https://docs.google.com/spreadsheets/d/1qHalit8sXC0R8oVXibc2wa2gY7bkwGzOybEMTWp-08o/edit?gid=1597424158#gid=1597424158) | [Reference](https://www.gapminder.org/data/geo/) +- **Date Accessed**: July 11, 2024 +- **License**: Creative Commons Attribution 4.0 International (CC BY 4.0) | [Reference](https://www.gapminder.org/free-material/) +This dataset combines key demographic indicators (life expectancy at birth, population, and fertility rate measured as babies per woman) for various countries from 1955 to 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable grouping countries. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) notes that its philosophy is to fill data gaps with estimates and use current geographic boundaries for historical data. Gapminder states that it aims to \"show people the big picture\" rather than support detailed numeric analysis. +1. `year` (type: integer): Years from 1955 to 2005 at 5-year intervals +2. `country` (type: string): Name of the country +3. `cluster` (type: integer): A categorical variable (values 0-5) grouping countries. See Revision Notes for details. +4. `pop` (type: integer): Population of the country +5. `life_expect` (type: float): Life expectancy in years +6. `fertility` (type: float): Fertility rate (average number of children per woman) +1. Country Selection: The set of countries in this file matches the version of this dataset originally added to this collection in 2015. The specific criteria for country selection in that version are not known. Data for Aruba are no longer available in the new version. 
Hong Kong has been revised to Hong Kong, China in the new version. +2. Data Precision: The precision of float values may have changed from the original version. These changes reflect the most recent source data used for each indicator. +3. Regional Groupings: The 'cluster' column represents a regional mapping of countries corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. To preserve continuity with previous versions of this dataset, we have retained the column name 'cluster' instead of renaming it to 'six_regions'. The six regions represented are: +`0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`.""" +[[resources.sources]] +title = "Gapminder Foundation - Population" +path = "https://docs.google.com/spreadsheets/d/1c1luQNdpH90tNbMIeU7jD__59wQ0bdIGRFpbMm8ZBTk/edit?gid=176703676#gid=176703676" +version = "7" +[[resources.sources]] +title = "Gapminder Foundation - Data Geographies" +path = "https://docs.google.com/spreadsheets/d/1qHalit8sXC0R8oVXibc2wa2gY7bkwGzOybEMTWp-08o/edit?gid=1597424158#gid=1597424158" +version = "2" +[[resources.sources]] +title = "Gapminder Data Documentation" +path = "https://www.gapminder.org/data/documentation/" + +[[resources]] # Path: gimp.png +path = "gimp.png" +description = """Application icons from open-source software projects.""" + +[[resources]] # Path: github.csv +path = "github.csv" +description = """Generated using `/scripts/github.py`.""" + +[[resources]] # Path: global-temp.csv +path = "global-temp.csv" +description = """Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, L-OTI), 1880-2023. 
Source: NASA's Goddard Institute for Space Studies https://data.giss.nasa.gov/gistemp/""" + +[[resources.sources]] +title = "NASA Goddard Institute for Space Studies" +path = "https://data.giss.nasa.gov/gistemp/" + +[[resources]] # Path: income.json +path = "income.json" + +[[resources]] # Path: iowa-electricity.csv +path = "iowa-electricity.csv" +description = """The state of Iowa has dramatically increased its production of renewable wind power in recent years. This file contains the annual net generation of electricity in the state by source in thousand megawatthours. The dataset was compiled by the [U.S. Energy Information Administration](https://www.eia.gov/beta/electricity/data/browser/#/topic/0?agg=2,0,1&fuel=vvg&geo=00000g&sec=g&linechart=ELEC.GEN.OTH-IA-99.A~ELEC.GEN.COW-IA-99.A~ELEC.GEN.PEL-IA-99.A~ELEC.GEN.PC-IA-99.A~ELEC.GEN.NG-IA-99.A~~ELEC.GEN.NUC-IA-99.A~ELEC.GEN.HYC-IA-99.A~ELEC.GEN.AOR-IA-99.A~ELEC.GEN.HPS-IA-99.A~&columnchart=ELEC.GEN.ALL-IA-99.A&map=ELEC.GEN.ALL-IA-99.A&freq=A&start=2001&end=2017&ctype=linechart&ltype=pin&tab=overview&maptype=0&rse=0&pin=) and downloaded on May 6, 2018. It is useful for illustrating stacked area charts.""" + +[[resources.sources]] +title = "U.S. Energy Information Administration" +path = "https://www.eia.gov/beta/electricity/data/browser/#/topic/0?agg=2,0,1&fuel=vvg&geo=00000g&sec=g&linechart=ELEC.GEN.OTH-IA-99.A~ELEC.GEN.COW-IA-99.A~ELEC.GEN.PEL-IA-99.A~ELEC.GEN.PC-IA-99.A~ELEC.GEN.NG-IA-99.A~~ELEC.GEN.NUC-IA-99.A~ELEC.GEN.HYC-IA-99.A~ELEC.GEN.AOR-IA-99.A~ELEC.GEN.HPS-IA-99.A~&columnchart=ELEC.GEN.ALL-IA-99.A&map=ELEC.GEN.ALL-IA-99.A&freq=A&start=2001&end=2017&ctype=linechart&ltype=pin&tab=overview&maptype=0&rse=0&pin=" + +[[resources]] # Path: jobs.json +path = "jobs.json" +description = """U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. 
The dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census microdata\" from as early as 1790. +Originally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Viégas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). The dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/). +Data is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of [IPUMS USA](https://usa.ipums.org/usa/), according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions). +The dataset is structured as follows: + +- job: The occupation title +- sex: Sex (men/women) +- year: Census year +- count: Number of individuals in the occupation +- perc: Percentage of the workforce in the occupation +IPUMS USA confirmed in 2024 correspondence that hosting this dataset on vega-datasets is permissible, stating: +>We're excited to hear that this dataset made its way to this repository and is being used by students for data visualization. We allow for these types of redistributions of summary data so long as the underlying microdata records are not shared. +This dataset contains only summary statistics and does not include any underlying microdata records. +1. This dataset represents summary data. The underlying microdata records are not included. +2. 
Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section) (person weight) variable as an expansion factor when working with IPUMS USA extracts. +3. Due to coding revisions, figures for earlier years (particularly 19th century) may not match current IPUMS USA data exactly. +When using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml). The organization requests use of the following citation for this json file: +Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0""" +[[resources.sources]] +title = "IPUMS USA" +path = "https://usa.ipums.org/usa/" + +[[resources]] # Path: la-riots.csv +path = "la-riots.csv" +description = """More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles for five days starting on April 29, 1992. This file contains metadata about each person, including the geographic coordinates of their death. It was compiled and published by the [Los Angeles Times Data Desk](http://spreadsheets.latimes.com/la-riots-deaths/).""" + +[[resources.sources]] +title = "Los Angeles Times Data Desk" +path = "http://spreadsheets.latimes.com/la-riots-deaths/" + +[[resources]] # Path: londonBoroughs.json +path = "londonBoroughs.json" +description = """Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile held at https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london. 
Original data \"contains National Statistics data © Crown copyright and database right (2015)\" and \"Contains Ordnance Survey data © Crown copyright and database right [2015].\"""" + +[[resources.sources]] +title = "London Datastore" +path = "https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london" + +[[resources]] # Path: londonCentroids.json +path = "londonCentroids.json" +description = """Calculated from `londonBoroughs.json` using `d3.geoCentroid`.""" + +[[resources]] # Path: londonTubeLines.json +path = "londonTubeLines.json" +description = """Selected rail lines simplified from `tfl_lines.json` at https://github.com/oobrien/vis/tree/master/tube/data""" + +[[resources.sources]] +title = "London Tube Data" +path = "https://github.com/oobrien/vis/tree/master/tube/data" + +[[resources]] # Path: lookup_groups.csv +path = "lookup_groups.csv" + +[[resources]] # Path: lookup_people.csv +path = "lookup_people.csv" + +[[resources]] # Path: miserables.json +path = "miserables.json" + +[[resources]] # Path: monarchs.json +path = "monarchs.json" +description = """A chronological list of English and British monarchs from Elizabeth I through George IV. +Each entry includes: + +- `name`: The ruler's name or identifier (e.g., \"W&M\" for William and Mary, \"Cromwell\" for the period of interregnum) +- `start`: The year their rule began. +- `end`: The year their rule ended +- `index`: A [zero-based sequential number](https://en.wikipedia.org/wiki/Zero-based_numbering) assigned to each entry, representing the chronological order of rulers +- `commonwealth`: A Boolean flag (true) for the period from 1649 to 1660. This field is omitted for all other entries. +The dataset contains two intentional inaccuracies to maintain compatibility with the [Wheat and Wages](https://vega.github.io/vega/examples/wheat-and-wages/) example visualization: +1. the start date for the reign of Elizabeth I is shown as 1565, instead of 1558; +2. 
the end date for the reign of George IV is shown as 1820, instead of 1830. +These discrepancies align the `monarchs.json` dataset with the start and end dates of the `wheat.json` dataset used in the visualization. +The entry \"W&M\" represents the joint reign of William III and Mary II. While the dataset shows their reign as 1689-1702, the official Web site of the British royal family indicates that Mary II's reign ended in 1694, though William III continued to rule until 1702. +The `commonwealth` field is used to flag the period from 1649 to 1660, which includes the Commonwealth of England, the Protectorate, and the period leading to the Restoration. While historically more accurate to call this the \"interregnum,\" the field name of `commonwealth` from the original dataset is retained for backwards compatibility. +The dataset was revised in Aug. 2024. James II's reign now ends in 1688 (previously 1689). +Source data has been verified against the [kings & queens](https://www.royal.uk/kings-and-queens-1066) and [interregnum](https://www.royal.uk/interregnum-1649-1660) pages of the [official website of the British royal family](https://www.royal.uk) (retrieved in Aug. 2024). Content on the site is protected by Crown Copyright. Under the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most Crown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/).""" +[[resources.sources]] +title = "The Royal Family" +path = "https://www.royal.uk/kings-and-queens-1066" + +[[resources]] # Path: movies.json +path = "movies.json" +description = """The dataset has well known and intentionally included errors. 
This dataset is used for instructional purposes, including the need to reckon with dirty data.""" + +[[resources]] # Path: normal-2d.json +path = "normal-2d.json" + +[[resources]] # Path: obesity.json +path = "obesity.json" + +[[resources]] # Path: ohlc.json +path = "ohlc.json" +description = """This dataset contains the performance of the Chicago Board Options Exchange [Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/%5EVIX?ltr=1#eyJpbnRlcnZhbCI6ImRheSIsInBlcmlvZGljaXR5IjoxLCJ0aW1lVW5pdCI6bnVsbCwiY2FuZGxlV2lkdGgiOjgsInZvbHVtZVVuZGVybGF5Ijp0cnVlLCJhZGoiOnRydWUsImNyb3NzaGFpciI6dHJ1ZSwiY2hhcnRUeXBlIjoibGluZSIsImV4dGVuZGVkIjpmYWxzZSwibWFya2V0U2Vzc2lvbnMiOnt9LCJhZ2dyZWdhdGlvblR5cGUiOiJvaGxjIiwiY2hhcnRTY2FsZSI6ImxpbmVhciIsInN0dWRpZXMiOnsidm9sIHVuZHIiOnsidHlwZSI6InZvbCB1bmRyIiwiaW5wdXRzIjp7ImlkIjoidm9sIHVuZHIiLCJkaXNwbGF5Ijoidm9sIHVuZHIifSwib3V0cHV0cyI6eyJVcCBWb2x1bWUiOiIjMDBiMDYxIiwiRG93biBWb2x1bWUiOiIjRkYzMzNBIn0sInBhbmVsIjoiY2hhcnQiLCJwYXJhbWV0ZXJzIjp7IndpZHRoRmFjdG9yIjowLjQ1LCJjaGFydE5hbWUiOiJjaGFydCJ9fX0sInBhbmVscyI6eyJjaGFydCI6eyJwZXJjZW50IjoxLCJkaXNwbGF5IjoiXlZJWCIsImNoYXJ0TmFtZSI6ImNoYXJ0IiwidG9wIjowfX0sInNldFNwYW4iOnt9LCJsaW5lV2lkdGgiOjIsInN0cmlwZWRCYWNrZ3JvdWQiOnRydWUsImV2ZW50cyI6dHJ1ZSwiY29sb3IiOiIjMDA4MWYyIiwiZXZlbnRNYXAiOnsiY29ycG9yYXRlIjp7ImRpdnMiOnRydWUsInNwbGl0cyI6dHJ1ZX0sInNpZ0RldiI6e319LCJzeW1ib2xzIjpbeyJzeW1ib2wiOiJeVklYIiwic3ltYm9sT2JqZWN0Ijp7InN5bWJvbCI6Il5WSVgifSwicGVyaW9kaWNpdHkiOjEsImludGVydmFsIjoiZGF5IiwidGltZVVuaXQiOm51bGwsInNldFNwYW4iOnt9fV19)) in the summer of 2009.""" + +[[resources.sources]] +title = "Yahoo Finance VIX Data" +path = "https://finance.yahoo.com/chart/%5EVIX" + +[[resources]] # Path: penguins.json +path = "penguins.json" +description = """Palmer Archipelago (Antarctica) penguin data collected and made available by [Dr. 
Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) and the [Palmer Station, Antarctica LTER](https://pal.lternet.edu/), a member of the [Long Term Ecological Research Network](https://lternet.edu/). For more information visit [allisonhorst/penguins](https://github.com/allisonhorst/penguins) on GitHub.""" + +[[resources.sources]] +title = "Palmer Station Antarctica LTER" +path = "https://pal.lternet.edu/" +[[resources.sources]] +title = "Allison Horst's Penguins Repository" +path = "https://github.com/allisonhorst/penguins" + +[[resources]] # Path: platformer-terrain.json +path = "platformer-terrain.json" +description = """Assets from the video game [Celeste](http://www.celestegame.com/).""" + +[[resources.sources]] +title = "Celeste Game" +path = "http://www.celestegame.com/" + +[[resources]] # Path: points.json +path = "points.json" + +[[resources]] # Path: political-contributions.json +path = "political-contributions.json" +description = """Summary financial information on contributions to candidates for U.S. elections. An updated version of this dataset is available from the \"all candidates\" files (in pipe-delimited format) on the [bulk data download](https://www.fec.gov/data/browse-data/?tab=bulk-data) page of the U.S. Federal Election Commission, or, alternatively, via [OpenFEC](https://api.open.fec.gov/developers/). Information on each of the 25 columns is available from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/). The sample dataset in `political-contributions.json` contains 58 records with dates from 2015. 
+FEC data is subject to the commission's: + +- [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/) +- [Privacy and Security Policy](https://www.fec.gov/about/privacy-and-security-policy/) +- [Acceptable Use Policy](https://github.com/fecgov/FEC/blob/master/ACCEPTABLE-USE-POLICY.md) +Additionally, the FEC's Github [repository](https://github.com/fecgov/FEC) states: +> This project is in the public domain within the United States, and we waive worldwide copyright and related rights through [CC0 universal public domain](https://creativecommons.org/publicdomain/zero/1.0/) dedication. Read more on our license page. A few restrictions limit the way you can use FEC data. For example, you can't use contributor lists for commercial purposes or to solicit donations. Learn more on [FEC.gov](https://www.fec.gov/).""" +[[resources.sources]] +title = "Federal Election Commission Bulk Data" +path = "https://www.fec.gov/data/browse-data/?tab=bulk-data" +[[resources.sources]] +title = "OpenFEC API" +path = "https://api.open.fec.gov/developers/" + +[[resources]] # Path: population.json +path = "population.json" +description = """United States population statistics by sex and age group across decades between 1850 and 2000. The dataset was obtained from [IPUMS USA](https://usa.ipums.org/usa/), which \"collects, preserves and harmonizes U.S. census microdata\" from as early as 1790. +The dataset is structured as follows: + +- year: four-digit year of the survey. 
- [IPUMS description](https://usa.ipums.org/usa-action/variables/YEAR#description_section) +- age: age group in 5-year intervals (0 represents ages 0-4, 5 represents 5-9, 10 represents 10-14, etc., up to 90 representing 90 and above) - [IPUMS description](https://usa.ipums.org/usa-action/variables/AGE#description_section) +- sex: Sex (men = 1 / women = 2) - [IPUMS description](https://usa.ipums.org/usa-action/variables/SEX#description_section) +- people: Number of individuals, equivalent to IPUMS variable name [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section). +IPUMS updates and revises datasets over time, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions). +When using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml). The organization requests the use of the following citation for this json file: +Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0""" +[[resources.sources]] +title = "IPUMS USA" +path = "https://usa.ipums.org/usa/" + +[[resources]] # Path: population_engineers_hurricanes.csv +path = "population_engineers_hurricanes.csv" +description = """Data about engineers from https://www.bls.gov/oes/tables.htm. Hurricane data from http://www.nhc.noaa.gov/paststate.shtml. 
Income data from https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_07_3YR_S1901&prodType=table.""" + +[[resources.sources]] +title = "Bureau of Labor Statistics" +path = "https://www.bls.gov/oes/tables.htm" +[[resources.sources]] +title = "American Community Survey" +path = "https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_07_3YR_S1901&prodType=table" +[[resources.sources]] +title = "NOAA National Climatic Data Center" +path = "https://www.ncdc.noaa.gov/cdo-web/datatools/records" + +[[resources]] # Path: seattle-weather-hourly-normals.csv +path = "seattle-weather-hourly-normals.csv" +description = """Data from [NOAA](https://www.ncdc.noaa.gov/cdo-web/datatools/normals). Hourly weather normals with metric units. The 1981-2010 Climate Normals are NCDC's three-decade averages of climatological variables, including temperature and precipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf). We only included temperature, wind, and pressure and updated the format to be easier to parse.""" + +[[resources.sources]] +title = "NOAA National Climatic Data Center (NCDC)" +path = "https://www.ncdc.noaa.gov/cdo-web/datatools/normals" + +[[resources]] # Path: seattle-weather.csv +path = "seattle-weather.csv" +description = """Data from [NOAA](https://www.ncdc.noaa.gov/cdo-web/datatools/records). Daily weather records with metric units. Transformed using `/scripts/weather.py`. We synthesized the categorical \"weather\" field from multiple fields in the original dataset. 
This data is intended for instructional purposes.""" + +[[resources.sources]] +title = "NOAA National Climatic Data Center" +path = "https://www.ncdc.noaa.gov/cdo-web/datatools/records" + +[[resources]] # Path: sp500-2000.csv +path = "sp500-2000.csv" +description = """S&P 500 index values from 2000 to 2020, retrieved from [Yahoo Finance](https://finance.yahoo.com/quote/%5EDJI/history/).""" + +[[resources.sources]] +title = "Yahoo Finance" +path = "https://finance.yahoo.com/quote/%5EDJI/history/" + +[[resources]] # Path: sp500.csv +path = "sp500.csv" + +[[resources]] # Path: stocks.csv +path = "stocks.csv" + +[[resources]] # Path: udistrict.json +path = "udistrict.json" + +[[resources]] # Path: unemployment-across-industries.json +path = "unemployment-across-industries.json" +description = """Industry-level unemployment statistics from the [Current Population Survey](https://www.census.gov/programs-surveys/cps.html) (CPS), published monthly by the U.S. Bureau of Labor Statistics. Includes unemployed persons and unemployment rate across 11 private industries, as well as agricultural, government, and self-employed workers. Covers January 2000 through February 2010. Industry classification follows format of CPS [Table A-31](https://www.bls.gov/web/empsit/cpseea31.htm). +Each entry in the JSON file contains: + +- `series`: Industry name +- `year`: Year (2000-2010) +- `month`: Month (1-12) +- `count`: Number of unemployed persons (in thousands) +- `rate`: Unemployment rate (percentage) +- `date`: [ISO 8601](https://www.iso.org/iso-8601-date-and-time-format.html)-formatted date string (e.g., \"2000-01-01T08:00:00.000Z\") +The dataset can be replicated using the BLS API. For more, see the `scripts` folder of this repository. +The BLS Web site states: +> \"Users of the public API should cite the date that data were accessed or retrieved using the API. 
Users must clearly state that “BLS.gov cannot vouch for the data or analyses derived from these data after the data have been retrieved from BLS.gov.” The BLS.gov logo may not be used by persons who are not BLS employees or on products (including web pages) that are not BLS-sponsored.\" +See full BLS [terms of service](https://www.bls.gov/developers/termsOfService.htm).""" +[[resources.sources]] +title = "U.S. Census Bureau Current Population Survey" +path = "https://www.census.gov/programs-surveys/cps.html" +[[resources.sources]] +title = "BLS Local Area Unemployment Statistics" +path = "https://www.bls.gov/lau/" +[[resources.sources]] +title = "BLS LAUS Data Tools" +path = "https://www.bls.gov/lau/data.htm" +[[resources.sources]] +title = "Bureau of Labor Statistics Table A-31" +path = "https://www.bls.gov/web/empsit/cpseea31.htm" + +[[resources]] # Path: unemployment.tsv +path = "unemployment.tsv" +description = """This dataset contains county-level unemployment rates in the United States, with data generally consistent with levels reported in 2009. The dataset is structured as tab-separated values with two columns: +1. `id`: The combined [state and county FIPS code](https://www.census.gov/library/reference/code-lists/ansi.html) +2. `rate`: The unemployment rate for the county + +The unemployment rate represents the number of unemployed persons as a percentage of the labor force. According to the [Bureau of Labor Statistics (BLS) glossary](https://www.bls.gov/opub/hom/glossary.htm#U): +> Unemployed persons (Current Population Survey) [are] persons aged 16 years and older who had no employment during the reference week, were available for work, except for temporary illness, and had made specific efforts to find employment sometime during the 4-week period ending with the reference week. Persons who were waiting to be recalled to a job from which they had been laid off need not have been looking for work to be classified as unemployed. 
+The labor force includes all persons classified as employed or unemployed in accordance with the BLS definitions. +This dataset is derived from the [Local Area Unemployment Statistics (LAUS)](https://www.bls.gov/lau/) program, a federal-state cooperative effort overseen by the Bureau of Labor Statistics (BLS). The LAUS program produces monthly and annual employment, unemployment, and labor force data for census regions and divisions, states, counties, metropolitan areas, and many cities and towns. +For the most up-to-date LAUS data: +1. **Monthly and Annual Data Downloads**: +- Visit the [LAUS Data Tools](https://www.bls.gov/lau/data.htm) page for [monthly](https://www.bls.gov/lau/tables.htm#mcounty) and [annual](https://www.bls.gov/lau/tables.htm#cntyaa) county data. +2. **BLS Public Data API**: +- The BLS provides an [API for developers](https://www.bls.gov/developers/) to access various datasets, including LAUS data. +- To use the API for LAUS data, refer to the [LAUS Series ID Formats](https://www.bls.gov/help/hlpforma.htm#LA) to construct your query. +- API documentation and examples are available on the [BLS Developers](https://www.bls.gov/developers/) page. +When using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm), which includes the following guidelines: +1. Cite the date that data were accessed or retrieved. +2. Acknowledge that \"BLS.gov cannot vouch for the data or analyses derived from these data after the data have been retrieved from BLS.gov.\" +3. Do not use the BLS logo without permission. 
+For detailed methodology and technical information about LAUS estimates, refer to the [BLS Handbook of Methods](https://www.bls.gov/opub/hom/lau/home.htm).""" +[[resources.sources]] +title = "BLS Handbook of Methods" +path = "https://www.bls.gov/opub/hom/lau/home.htm" +[[resources.sources]] +title = "BLS Developers API" +path = "https://www.bls.gov/developers/" + +[[resources]] # Path: uniform-2d.json +path = "uniform-2d.json" + +[[resources]] # Path: us-10m.json +path = "us-10m.json" + +[[resources]] # Path: us-employment.csv +path = "us-employment.csv" +description = """In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job losses across the United States. The downturn in employment, and the slow recovery in hiring that followed, was tracked each month by the [Current Employment Statistics](https://www.bls.gov/ces/) program at the U.S. Bureau of Labor Statistics. +This file contains the monthly employment total in a variety of job categories from January 2006 through December 2015. The numbers are seasonally adjusted and reported in thousands. The data were downloaded on Nov. 11, 2018, and reformatted for use in this library. + +Totals are included for the [22 \"supersectors\"](https://download.bls.gov/pub/time.series/ce/ce.supersector) tracked by the BLS. The \"nonfarm\" total is the category typically used by economists and journalists as a stand-in for the country's employment total. +A calculated \"nonfarm_change\" column has been appended with the month-to-month change in that supersector's employment. It is useful for illustrating how to make bar charts that report both negative and positive values.""" +[[resources.sources]] +title = "U.S. 
Bureau of Labor Statistics Current Employment Statistics" +path = "https://www.bls.gov/ces/" +[[resources.sources]] +title = "BLS Supersectors" +path = "https://download.bls.gov/pub/time.series/ce/ce.supersector" + +[[resources]] # Path: us-state-capitals.json +path = "us-state-capitals.json" + +[[resources]] # Path: volcano.json +path = "volcano.json" +description = """Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. This data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a topographic map by Ross Ihaka, adapted from [R datasets](https://stat.ethz.ch/R-manual/R-patched/library/datasets/html/volcano.html). These data should not be regarded as accurate.""" + +[[resources.sources]] +title = "R Datasets" +path = "https://stat.ethz.ch/R-manual/R-patched/library/datasets/html/volcano.html" + +[[resources]] # Path: weather.csv +path = "weather.csv" +description = """Data from [NOAA](http://www.ncdc.noaa.gov/cdo-web/datatools/findstation). Transformed using `/scripts/weather.py`. We synthesized the categorical \"weather\" field from multiple fields in the original dataset. This data is intended for instructional purposes.""" + +[[resources.sources]] +title = "NOAA Climate Data Online" +path = "http://www.ncdc.noaa.gov/cdo-web/datatools/findstation" + +[[resources]] # Path: weather.json +path = "weather.json" +description = """Instructional dataset showing actual and predicted temperature data.""" + +[[resources]] # Path: wheat.json +path = "wheat.json" +description = """In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair), a Scottish engineer who is often credited as the founder of statistical graphics, published [an elegant chart on the price of wheat](http://dh101.humanities.ucla.edu/wp-content/uploads/2014/08/Vis_2.jpg). It plots 250 years of prices alongside weekly wages and the reigning monarch. 
He intended to demonstrate that “never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time.”""" + +[[resources.sources]] +title = "1822 Playfair Chart" +path = "http://dh101.humanities.ucla.edu/wp-content/uploads/2014/08/Vis_2.jpg" + +[[resources]] # Path: windvectors.csv +path = "windvectors.csv" +description = """Simulated wind patterns over northwestern Europe.""" + +[[resources]] # Path: world-110m.json +path = "world-110m.json" + +[[resources]] # Path: zipcodes.csv +path = "zipcodes.csv" +description = """GeoNames.org""" + +[[resources.sources]] +title = "GeoNames" +path = "https://www.geonames.org" From 42b5b255d72138c46f03fc5441ef28643de09510 Mon Sep 17 00:00:00 2001 From: ds <63077097+dsmedia@users.noreply.github.com> Date: Mon, 9 Dec 2024 21:26:51 -0500 Subject: [PATCH 02/40] feat: Migrate sources, licenses and column descriptions out of resource description --- SOURCES.toml | 291 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 178 insertions(+), 113 deletions(-) diff --git a/SOURCES.toml b/SOURCES.toml index c2585cf8..fe8d3be7 100644 --- a/SOURCES.toml +++ b/SOURCES.toml @@ -7,7 +7,7 @@ path = "airports.csv" [[resources]] # Path: annual-precip.json path = "annual-precip.json" -description = """A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell, from [CFSv2](https://www.ncdc.noaa.gov/data-access/model-data/model-datasets/climate-forecast-system-version2-cfsv2).""" +description = """A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell.""" [[resources.sources]] title = "Climate Forecast System Version 2" @@ -19,7 +19,7 @@ description = """Graphs in Statistical Analysis, F. J. 
Anscombe, The American St [[resources]] # Path: barley.json path = "barley.json" -description = """The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites. It was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper \"Statistical Determination of Barley Varietal Adaption.\" R.A. Fisher's popularized its use in the field of statistics when he included it in his book [\"The Design of Experiments.\"](https://en.wikipedia.org/wiki/The_Design_of_Experiments) Since then it has been used to demonstrate new statistical techniques, including the [trellis charts](http://ml.stat.purdue.edu/stat695t/writings/TrellisDesignControl.pdf) developed by Richard Becker, William Cleveland and others in the 1990s.""" +description = """The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites. It was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper \"Statistical Determination of Barley Varietal Adaptation.\" R.A. Fisher popularized its use in the field of statistics when he included it in his book \"The Design of Experiments.\" Since then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s.""" [[resources.sources]] title = "The Design of Experiments Reference" @@ -30,7 +30,7 @@ path = "http://ml.stat.purdue.edu/stat695t/writings/TrellisDesignControl.pdf" [[resources]] # Path: birdstrikes.csv path = "birdstrikes.csv" -description = """http://wildlife.faa.gov""" +description = "Records of reported wildlife strikes received by the U.S. 
FAA" [[resources.sources]] title = "FAA Wildlife Strike Database" @@ -38,11 +38,10 @@ path = "http://wildlife.faa.gov" [[resources]] # Path: budget.json path = "budget.json" -description = """Source: Office of Management and Budget (U.S.) -[Budget FY 2016 - Receipts](https://www.govinfo.gov/app/details/BUDGET-2016-DB/BUDGET-2016-DB-3)""" +description = """Historical and forecasted federal revenue/receipts produced in 2016 by the U.S. Office of Management and Budget.""" [[resources.sources]] -title = "Office of Management and Budget - Budget FY 2016" +title = "Office of Management and Budget - Budget FY 2016 - Receipts" path = "https://www.govinfo.gov/app/details/BUDGET-2016-DB/BUDGET-2016-DB-3" [[resources]] # Path: budgets.json @@ -50,12 +49,13 @@ path = "budgets.json" [[resources]] # Path: burtin.json path = "burtin.json" -description = """The burtin.json dataset is based on graphic designer [Will Burtin's](https://en.wikipedia.org/wiki/Will_Burtin) 1951 visualization of antibiotic effectiveness, originally published in [Scope Magazine](https://graphicdesignarchives.org/projects/scope-magazine-vol-iii-5/). The dataset compares the performance of three antibiotics against 16 different bacteria. The numerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness. The dataset was featured as an [example](https://mbostock.github.io/protovis/ex/antibiotics-burtin.html) in the Protovis project, a precursor to D3.js. 
The Protovis example notes that, \"Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 μg/ml, and an exaggeration of some values for penicillin.\" The vega-datsets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together. +description = """The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine. The dataset compares the performance of three antibiotics against 16 different bacteria. The numerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness. The dataset was featured as an example in the Protovis project, a precursor to D3.js. The Protovis example notes that, \"Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 μg/ml, and an exaggeration of some values for penicillin.\" The vega-datasets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together. The caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) reads as follows: > ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin > > The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in red and gram- in blue) with their sensitivities to penicillin, and streptomycin. The effectiveness of the antibiotics is expressed as the highest dilution in μ/ml. which inhibits the test organism. 
High dilutions are toward the periphery; consequently the length of the colored bar is proportional to the effectiveness. It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. vulgaris, S. schottmuelleri and M. tuberculosis. Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to neomycin, although the majority of these are sensitive to neomycin. It also inhibits actinomycetes, but is inactive against viruses and fungi. Its mode of action is not understood.""" + [[resources.sources]] title = "Scope Magazine" path = "https://graphicdesignarchives.org/projects/scope-magazine-vol-iii-5/" @@ -65,7 +65,7 @@ path = "https://mbostock.github.io/protovis/ex/antibiotics-burtin.html" [[resources]] # Path: cars.json path = "cars.json" -description = """http://lib.stat.cmu.edu/datasets/""" +description = "Collection of car specifications and performance metrics from various automobile manufacturers." [[resources.sources]] title = "StatLib Datasets Archive" @@ -73,7 +73,7 @@ path = "http://lib.stat.cmu.edu/datasets/" [[resources]] # Path: co2-concentration.csv path = "co2-concentration.csv" -description = """https://scrippsco2.ucsd.edu/data/atmospheric_co2/primary_mlo_co2_record but modified to only include date, CO2, seasonally adjusted CO2 and only include rows with valid data.""" +description = """Scripps CO2 program data but modified to only include date, CO2, seasonally adjusted CO2. 
Only includes rows with valid data.""" [[resources.sources]] title = "Scripps CO2 Program" @@ -81,40 +81,62 @@ path = "https://scrippsco2.ucsd.edu/data/atmospheric_co2/primary_mlo_co2_record" [[resources]] # Path: countries.json path = "countries.json" -description = """- **Original Data**: [Gapminder Foundation](https://www.gapminder.org/) -- **URLs**: - -- Life Expectancy (v14): [Data](https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676) | [Reference](https://www.gapminder.org/data/documentation/gd004/) -- Fertility (v14): [Data](https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676) | [Reference](https://www.gapminder.org/data/documentation/gd008/) -- **Date Accessed**: July 31, 2024 -- **License**: Creative Commons Attribution 4.0 International (CC BY 4.0) | [Reference](https://www.gapminder.org/free-material/) -This dataset combines key demographic indicators (life expectancy at birth and fertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year intervals. It includes both current values and adjacent time period values (previous and next) for each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) notes that its philosophy is to fill data gaps with estimates and use current geographic boundaries for historical data. Gapminder states that it aims to \"show people the big picture\" rather than support detailed numeric analysis. -1. `year` (type: integer): Years from 1955 to 2000 at 5-year intervals -2. `country` (type: string): Name of the country -3. `fertility` (type: float): Fertility rate (average number of children per woman) for the given year -4. `life_expect` (type: float): Life expectancy in years for the given year -5. `p_fertility` (type: float): Fertility rate for the previous 5-year interval -6. 
`n_fertility` (type: float): Fertility rate for the next 5-year interval -7. `p_life_expect` (type: float): Life expectancy for the previous 5-year interval -8. `n_life_expect` (type: float): Life expectancy for the next 5-year interval""" +description = """This dataset combines key demographic indicators (life expectancy at birth and fertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year intervals. It includes both current values and adjacent time period values (previous and next) for each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) notes that its philosophy is to fill data gaps with estimates and use current geographic boundaries for historical data. Gapminder states that it aims to \"show people the big picture\" rather than support detailed numeric analysis.""" + +[resources.schema] +[[resources.schema.fields]] +name = "year" +description = "Years from 1955 to 2000 at 5-year intervals" + +[[resources.schema.fields]] +name = "country" +description = "Name of the country" + +[[resources.schema.fields]] +name = "fertility" +description = "Fertility rate (average number of children per woman) for the given year" + +[[resources.schema.fields]] +name = "life_expect" +description = "Life expectancy in years for the given year" + +[[resources.schema.fields]] +name = "p_fertility" +description = "Fertility rate for the previous 5-year interval" + +[[resources.schema.fields]] +name = "n_fertility" +description = "Fertility rate for the next 5-year interval" + +[[resources.schema.fields]] +name = "p_life_expect" +description = "Life expectancy for the previous 5-year interval" + +[[resources.schema.fields]] +name = "n_life_expect" +description = "Life expectancy for the next 5-year interval" + [[resources.sources]] title = "Gapminder Foundation - Life Expectancy" path = "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676" 
version = "14" + [[resources.sources]] title = "Gapminder Foundation - Fertility" path = "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676" version = "14" + [[resources.licenses]] title = "Creative Commons Attribution 4.0 International" path = "https://www.gapminder.org/free-material/" + [[resources]] # Path: crimea.json path = "crimea.json" [[resources]] # Path: disasters.csv path = "disasters.csv" -description = """https://ourworldindata.org/natural-catastrophes""" +description = "Annual number of deaths from disasters." [[resources.sources]] title = "Our World in Data - Natural Catastrophes" @@ -122,7 +144,6 @@ path = "https://ourworldindata.org/natural-catastrophes" [[resources]] # Path: driving.json path = "driving.json" -description = """https://archive.nytimes.com/www.nytimes.com/imagepages/2010/05/02/business/02metrics.html""" [[resources.sources]] title = "New York Times" @@ -130,8 +151,7 @@ path = "https://archive.nytimes.com/www.nytimes.com/imagepages/2010/05/02/busine [[resources]] # Path: earthquakes.json path = "earthquakes.json" -description = """https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson -(Feb 6, 2018)""" +description = """Earthquake data retrieved Feb 6, 2018""" [[resources.sources]] title = "USGS Earthquake Feed" @@ -149,8 +169,7 @@ path = "flare.json" [[resources]] # Path: flights-10k.json path = "flights-10k.json" -description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr -Transformed using `/scripts/flights.py`""" +description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" [[resources.sources]] title = "U.S. 
Bureau of Transportation Statistics" @@ -158,8 +177,7 @@ path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_ [[resources]] # Path: flights-200k.arrow path = "flights-200k.arrow" -description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr -Transformed using `/scripts/flights.py`""" +description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" [[resources.sources]] title = "U.S. Bureau of Transportation Statistics" @@ -167,8 +185,7 @@ path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_ [[resources]] # Path: flights-200k.json path = "flights-200k.json" -description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr -Transformed using `/scripts/flights.py`""" +description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" [[resources.sources]] title = "U.S. Bureau of Transportation Statistics" @@ -176,8 +193,7 @@ path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_ [[resources]] # Path: flights-20k.json path = "flights-20k.json" -description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr -Transformed using `/scripts/flights.py`""" +description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" [[resources.sources]] title = "U.S. 
Bureau of Transportation Statistics" @@ -185,8 +201,7 @@ path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_ [[resources]] # Path: flights-2k.json path = "flights-2k.json" -description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr -Transformed using `/scripts/flights.py`""" +description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" [[resources.sources]] title = "U.S. Bureau of Transportation Statistics" @@ -194,8 +209,7 @@ path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_ [[resources]] # Path: flights-3m.parquet path = "flights-3m.parquet" -description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr -Transformed using `/scripts/flights.py`""" +description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" [[resources.sources]] title = "U.S. Bureau of Transportation Statistics" @@ -203,8 +217,7 @@ path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_ [[resources]] # Path: flights-5k.json path = "flights-5k.json" -description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr -Transformed using `/scripts/flights.py`""" +description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" [[resources.sources]] title = "U.S. 
Bureau of Transportation Statistics" @@ -212,8 +225,7 @@ path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_ [[resources]] # Path: flights-airport.csv path = "flights-airport.csv" -description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr -Transformed using `/scripts/flights.py`""" +description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" [[resources.sources]] title = "U.S. Bureau of Transportation Statistics" @@ -221,7 +233,7 @@ path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_ [[resources]] # Path: football.json path = "football.json" -description = """Football match outcomes across multiple divisions from 2013 to 2017. This dataset is a subset of a larger dataset from https://github.com/openfootball/football.json. The subset was made such that there are records for all five chosen divisions over the time period.""" +description = """Football match outcomes across multiple divisions from 2013 to 2017, part of a larger dataset from OpenFootball. The subset was made such that there are records for all five chosen divisions over the time period.""" [[resources.sources]] title = "OpenFootball" @@ -229,53 +241,97 @@ path = "https://github.com/openfootball/football.json" [[resources]] # Path: gapminder-health-income.csv path = "gapminder-health-income.csv" -description = """**Original Data**: [Gapminder Foundation](https://www.gapminder.org/) -**Description** Per-capita income, life expectancy, population and regional grouping. Dataset does not specify the reference year for the data. Gapminder historical data is subject to revisions. +description = """Per-capita income, life expectancy, population and regional grouping. Dataset does not specify the reference year for the data. 
Gapminder historical data is subject to revisions. Gapminder (v30, 2023) defines per-capita income as follows: ->\"This is real GDP per capita (gross domestic product per person adjusted for inflation) converted to international dollars using purchasing power parity rates. An international dollar has the same purchasing power over GDP as the U.S. dollar has in the United States.\" | [Source](https://docs.google.com/spreadsheets/d/1i5AEui3WZNZqh7MQ4AKkJuCz4rRxGR_pw_9gtbcBOqQ/edit?gid=501532268#gid=501532268) -**License**: Creative Commons Attribution 4.0 International (CC BY 4.0) | [Reference](https://www.gapminder.org/free-material/)""" +>\"This is real GDP per capita (gross domestic product per person adjusted for inflation) converted to international dollars using purchasing power parity rates. An international dollar has the same purchasing power over GDP as the U.S. dollar has in the United States.\"""" [[resources.sources]] title = "Gapminder Foundation" path = "https://www.gapminder.org" [[resources.sources]] title = "Gapminder GDP Per Capita Data" path = "https://docs.google.com/spreadsheets/d/1i5AEui3WZNZqh7MQ4AKkJuCz4rRxGR_pw_9gtbcBOqQ/edit?gid=501532268#gid=501532268" +[[resources.licenses]] +title = "Creative Commons Attribution 4.0 International" +path = "https://www.gapminder.org/free-material/" + [[resources]] # Path: gapminder.json +[[resources]] path = "gapminder.json" -description = """- **Original Data**: [Gapminder Foundation](https://www.gapminder.org/) -- **URLs**: - -- Life Expectancy (v14): [Data](https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676) | [Reference](https://www.gapminder.org/data/documentation/gd004/) -- Population (v7): [Data](https://docs.google.com/spreadsheets/d/1c1luQNdpH90tNbMIeU7jD__59wQ0bdIGRFpbMm8ZBTk/edit?gid=176703676#gid=176703676) | [Reference](https://www.gapminder.org/data/documentation/gd003/) -- Fertility (v14): 
[Data](https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676) | [Reference](https://www.gapminder.org/data/documentation/gd008/) -- Data Geographies (v2): [Data](https://docs.google.com/spreadsheets/d/1qHalit8sXC0R8oVXibc2wa2gY7bkwGzOybEMTWp-08o/edit?gid=1597424158#gid=1597424158) | [Reference](https://www.gapminder.org/data/geo/) -- **Date Accessed**: July 11, 2024 -- **License**: Creative Commons Attribution 4.0 International (CC BY 4.0) | [Reference](https://www.gapminder.org/free-material/) -This dataset combines key demographic indicators (life expectancy at birth, population, and fertility rate measured as babies per woman) for various countries from 1955 to 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable grouping countries. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) notes that its philosophy is to fill data gaps with estimates and use current geographic boundaries for historical data. Gapminder states that it aims to \"show people the big picture\" rather than support detailed numeric analysis. -1. `year` (type: integer): Years from 1955 to 2005 at 5-year intervals -2. `country` (type: string): Name of the country -3. `cluster` (type: integer): A categorical variable (values 0-5) grouping countries. See Revision Notes for details. -4. `pop` (type: integer): Population of the country -5. `life_expect` (type: float): Life expectancy in years -6. `fertility` (type: float): Fertility rate (average number of children per woman) +description = """This dataset combines key demographic indicators (life expectancy at birth, population, and fertility rate measured as babies per woman) for various countries from 1955 to 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable grouping countries. 
Gapminder's data documentation notes that its philosophy is to fill data gaps with estimates and use current geographic boundaries for historical data. Gapminder states that it aims to "show people the big picture" rather than support detailed numeric analysis. + +Notes: 1. Country Selection: The set of countries in this file matches the version of this dataset originally added to this collection in 2015. The specific criteria for country selection in that version are not known. Data for Aruba are no longer available in the new version. Hong Kong has been revised to Hong Kong, China in the new version. 2. Data Precision: The precision of float values may have changed from the original version. These changes reflect the most recent source data used for each indicator. -3. Regional Groupings: The 'cluster' column represents a regional mapping of countries corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. To preserve continuity with previous versions of this dataset, we have retained the column name 'cluster' instead of renaming it to 'six_regions'. The six regions represented are: -`0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`.""" +3. Regional Groupings: The 'cluster' column represents a regional mapping of countries corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. To preserve continuity with previous versions of this dataset, we have retained the column name 'cluster' instead of renaming it to 'six_regions'. 
The six regions represented are: `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`.""" + +[resources.schema] +[[resources.schema.fields]] +name = "year" +description = "Years from 1955 to 2005 at 5-year intervals" + +[[resources.schema.fields]] +name = "country" +description = "Name of the country" + +[[resources.schema.fields]] +name = "cluster" +description = "A categorical variable (values 0-5) grouping countries by region" + +[[resources.schema.fields]] +name = "pop" +description = "Population of the country" + +[[resources.schema.fields]] +name = "life_expect" +description = "Life expectancy in years" + +[[resources.schema.fields]] +name = "fertility" +description = "Fertility rate (average number of children per woman)" + +[[resources.sources]] +title = "Gapminder Foundation - Life Expectancy (Data)" +path = "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676" +version = "14" + +[[resources.sources]] +title = "Gapminder Foundation - Life Expectancy (Documentation)" +path = "https://www.gapminder.org/data/documentation/gd004/" + [[resources.sources]] -title = "Gapminder Foundation - Population" +title = "Gapminder Foundation - Population (Data)" path = "https://docs.google.com/spreadsheets/d/1c1luQNdpH90tNbMIeU7jD__59wQ0bdIGRFpbMm8ZBTk/edit?gid=176703676#gid=176703676" version = "7" + [[resources.sources]] -title = "Gapminder Foundation - Data Geographies" +title = "Gapminder Foundation - Population (Documentation)" +path = "https://www.gapminder.org/data/documentation/gd003/" + +[[resources.sources]] +title = "Gapminder Foundation - Fertility (Data)" +path = "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676" +version = "14" + +[[resources.sources]] +title = "Gapminder Foundation - Fertility (Documentation)" +path = 
"https://www.gapminder.org/data/documentation/gd008/" + +[[resources.sources]] +title = "Gapminder Foundation - Data Geographies (Data)" path = "https://docs.google.com/spreadsheets/d/1qHalit8sXC0R8oVXibc2wa2gY7bkwGzOybEMTWp-08o/edit?gid=1597424158#gid=1597424158" version = "2" + +[[resources.sources]] +title = "Gapminder Foundation - Data Geographies (Documentation)" +path = "https://www.gapminder.org/data/geo/" + [[resources.sources]] title = "Gapminder Data Documentation" path = "https://www.gapminder.org/data/documentation/" + [[resources]] # Path: gimp.png path = "gimp.png" description = """Application icons from open-source software projects.""" @@ -286,7 +342,7 @@ description = """Generated using `/scripts/github.py`.""" [[resources]] # Path: global-temp.csv path = "global-temp.csv" -description = """Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, L-OTI), 1880-2023. Source: NASA's Goddard Institute for Space Studies https://data.giss.nasa.gov/gistemp/""" +description = """Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, L-OTI), 1880-2023.""" [[resources.sources]] title = "NASA Goddard Institute for Space Studies" @@ -297,7 +353,7 @@ path = "income.json" [[resources]] # Path: iowa-electricity.csv path = "iowa-electricity.csv" -description = """The state of Iowa has dramatically increased its production of renewable wind power in recent years. This file contains the annual net generation of electricity in the state by source in thousand megawatthours. The dataset was compiled by the [U.S. 
Energy Information Administration](https://www.eia.gov/beta/electricity/data/browser/#/topic/0?agg=2,0,1&fuel=vvg&geo=00000g&sec=g&linechart=ELEC.GEN.OTH-IA-99.A~ELEC.GEN.COW-IA-99.A~ELEC.GEN.PEL-IA-99.A~ELEC.GEN.PC-IA-99.A~ELEC.GEN.NG-IA-99.A~~ELEC.GEN.NUC-IA-99.A~ELEC.GEN.HYC-IA-99.A~ELEC.GEN.AOR-IA-99.A~ELEC.GEN.HPS-IA-99.A~&columnchart=ELEC.GEN.ALL-IA-99.A&map=ELEC.GEN.ALL-IA-99.A&freq=A&start=2001&end=2017&ctype=linechart&ltype=pin&tab=overview&maptype=0&rse=0&pin=) and downloaded on May 6, 2018. It is useful for illustrating stacked area charts.""" +description = """The state of Iowa has dramatically increased its production of renewable wind power in recent years. This file contains the annual net generation of electricity in the state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. It is useful for illustrating stacked area charts.""" [[resources.sources]] title = "U.S. Energy Information Administration" @@ -307,7 +363,7 @@ path = "https://www.eia.gov/beta/electricity/data/browser/#/topic/0?agg=2,0,1&fu path = "jobs.json" description = """U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census microdata\" from as early as 1790. Originally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Viégas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). The dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/). -Data is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. 
The dataset appears to be derived from Version 6.0 (2015) of [IPUMS USA](https://usa.ipums.org/usa/), according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions). +Data is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of IPUMS USA, according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions). The dataset is structured as follows: - job: The occupation title @@ -323,24 +379,26 @@ This dataset contains only summary statistics and does not include any underlyin 3. Due to coding revisions, figures for earlier years (particularly 19th century) may not match current IPUMS USA data exactly. When using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml). The organization requests use of the following citation for this json file: Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0""" + [[resources.sources]] title = "IPUMS USA" path = "https://usa.ipums.org/usa/" +version = "6.0" [[resources]] # Path: la-riots.csv path = "la-riots.csv" -description = """More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles for five days starting on April 29, 1992. 
This file contains metadata about each person, including the geographic coordinates of their death. It was compiled and published by the [Los Angeles Times Data Desk](http://spreadsheets.latimes.com/la-riots-deaths/).""" +description = """More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles for five days starting on April 29, 1992. This file contains metadata about each person, including the geographic coordinates of their death. Compiled and published by the Los Angeles Times Data Desk.""" [[resources.sources]] -title = "Los Angeles Times Data Desk" +title = "LA Riots Deaths, Los Angeles Times Data Desk" path = "http://spreadsheets.latimes.com/la-riots-deaths/" [[resources]] # Path: londonBoroughs.json path = "londonBoroughs.json" -description = """Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile held at https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london. Original data \"contains National Statistics data © Crown copyright and database right (2015)\" and \"Contains Ordnance Survey data © Crown copyright and database right [2015].""" +description = """Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. Original data \"contains National Statistics data © Crown copyright and database right (2015)\" and \"Contains Ordnance Survey data © Crown copyright and database right [2015]\".""" [[resources.sources]] -title = "London Datastore" +title = "Statistical GIS Boundary Files, London Datastore" path = "https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london" [[resources]] # Path: londonCentroids.json @@ -349,7 +407,7 @@ description = """Calculated from `londongBoroughs.json` using `d3.geoCentroid`." 
[[resources]] # Path: londonTubeLines.json path = "londonTubeLines.json" -description = """Selected rail lines simplified from `tfl_lines.json` at https://github.com/oobrien/vis/tree/master/tube/data""" +description = """Selected rail lines simplified from source.""" [[resources.sources]] title = "London Tube Data" @@ -381,11 +439,17 @@ These discrepancies align the `monarchs.json` dataset with the start and end dat The entry \"W&M\" represents the joint reign of William III and Mary II. While the dataset shows their reign as 1689-1702, the official Web site of the British royal family indicates that Mary II's reign ended in 1694, though William III continued to rule until 1702. The `commonwealth` field is used to flag the period from 1649 to 1660, which includes the Commonwealth of England, the Protectorate, and the period leading to the Restoration. While historically more accurate to call this the \"interregnum,\" the field name of `commonwealth` from the original dataset is retained for backwards compatibility. The dataset was revised in Aug. 2024. James II's reign now ends in 1688 (previously 1689). -Source data has been verified against the [kings & queens](https://www.royal.uk/kings-and-queens-1066) and [interregnum](https://www.royal.uk/interregnum-1649-1660) [official website of the British royal family](https://www.royal.uk) pages of the official Web site of the British royal family (retrieved in Aug. 2024). Content on the site is protected by Crown Copyright. 
Under the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most Crown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/).""" +Source data has been verified against the kings & queens and interregnum pages of the official website of the British royal family (retrieved in Aug. 2024). Content on the site is protected by Crown Copyright. Under the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most Crown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/).""" + [[resources.sources]] -title = "The Royal Family" +title = "The Royal Family - Kings & Queens" path = "https://www.royal.uk/kings-and-queens-1066" +[[resources.sources]] +title = "The Royal Family - Interregnum" +path = "https://www.royal.uk/interregnum-1649-1660" + + [[resources]] # Path: movies.json path = "movies.json" description = """The dataset has well known and intentionally included errors. This dataset is used for instructional purposes, including the need to reckon with dirty data.""" @@ -406,7 +470,7 @@ path = "https://finance.yahoo.com/chart/%5EVIX" [[resources]] # Path: penguins.json path = "penguins.json" -description = """Palmer Archipelago (Antarctica) penguin data collected and made available by [Dr. Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) and the [Palmer Station, Antarctica LTER](https://pal.lternet.edu/), a member of the [Long Term Ecological Research Network](https://lternet.edu/). 
For more information visit [allisonhorst/penguins](https://github.com/allisonhorst/penguins) on GitHub.""" +description = """Palmer Archipelago (Antarctica) penguin data collected and made available by [Dr. Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) and the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research Network](https://lternet.edu/).""" [[resources.sources]] title = "Palmer Station Antarctica LTER" @@ -417,7 +481,7 @@ path = "https://github.com/allisonhorst/penguins" [[resources]] # Path: platformer-terrain.json path = "platformer-terrain.json" -description = """Assets from the video game [Celeste](http://www.celestegame.com/).""" +description = """Assets from the video game Celeste.""" [[resources.sources]] title = "Celeste Game" @@ -428,7 +492,7 @@ path = "points.json" [[resources]] # Path: political-contributions.json path = "political-contributions.json" -description = """Summary financial information on contributions to candidates for U.S. elections. An updated version of this datset is available from the \"all candidates\" files (in pipe-delimited format) on the [bulk data download](https://www.fec.gov/data/browse-data/?tab=bulk-data) page of the U.S. Federal Election Commission, or, alternatively, via [OpenFEC](https://api.open.fec.gov/developers/). Information on each of the 25 columns is available from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/). The sample dataset in `political-contributions.json` contains 58 records with dates from 2015. +description = """Summary financial information on contributions to candidates for U.S. elections. An updated version of this dataset is available from the \"all candidates\" files (in pipe-delimited format) on the bulk data download page of the U.S. Federal Election Commission, or, alternatively, via OpenFEC. 
Information on each of the 25 columns is available from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/). The sample dataset in `political-contributions.json` contains 58 records with dates from 2015. FEC data is subject to the commission's: - [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/) @@ -445,7 +509,7 @@ path = "https://api.open.fec.gov/developers/" [[resources]] # Path: population.json path = "population.json" -description = """United States population statistics by sex and age group across decades between 1850 and 2000. The dataset was obtained from [IPUMS USA](https://usa.ipums.org/usa/), which \"collects, preserves and harmonizes U.S. census microdata\" from as early as 1790. +description = """United States population statistics by sex and age group across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census microdata\" from as early as 1790. The dataset is structured as follows: - year: four-digit year of the survey. - [IPUMS description](https://usa.ipums.org/usa-action/variables/YEAR#description_section) @@ -461,7 +525,7 @@ path = "https://usa.ipums.org/usa/" [[resources]] # Path: population_engineers_hurricanes.csv path = "population_engineers_hurricanes.csv" -description = """Data about engineers from https://www.bls.gov/oes/tables.htm. Hurricane data from http://www.nhc.noaa.gov/paststate.shtml. Income data from https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_07_3YR_S1901&prodType=table.""" +description = """Per-state data on population, number of engineers, and hurricanes. 
Used in Vega-Lite example, [Three Choropleths Representing Disjoint Data from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html).""" [[resources.sources]] title = "Bureau of Labor Statistics" @@ -475,7 +539,7 @@ path = "https://www.ncdc.noaa.gov/cdo-web/datatools/records" [[resources]] # Path: seattle-weather-hourly-normals.csv path = "seattle-weather-hourly-normals.csv" -description = """Data from [NOAA](https://www.ncdc.noaa.gov/cdo-web/datatools/normals). Hourly weather normals with metric units. The 1981-2010 Climate Normals are NCDC's three-decade averages of climatological variables, including temperature and precipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf). We only included temperature, wind, and pressure and updated the format to be easier to parse.""" +description = """Hourly weather normals with metric units. The 1981-2010 Climate Normals are NCDC's three-decade averages of climatological variables, including temperature and precipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf). We only included temperature, wind, and pressure and updated the format to be easier to parse.""" [[resources.sources]] title = "NOAA National Climatic Data Center (NCDC)" @@ -483,7 +547,7 @@ path = "https://www.ncdc.noaa.gov/cdo-web/datatools/normals" [[resources]] # Path: seattle-weather.csv path = "seattle-weather.csv" -description = """Data from [NOAA](https://www.ncdc.noaa.gov/cdo-web/datatools/records). Daily weather records with metric units. Transformed using `/scripts/weather.py`. We synthesized the categorical \"weather\" field from multiple fields in the original dataset. This data is intended for instructional purposes.""" +description = """Daily weather records with metric units. Transformed using `/scripts/weather.py`. 
The categorical \"weather\" field is synthesized from multiple fields in the original dataset. This data is intended for instructional purposes.""" [[resources.sources]] title = "NOAA National Climatic Data Center" @@ -491,7 +555,7 @@ path = "https://www.ncdc.noaa.gov/cdo-web/datatools/records" [[resources]] # Path: sp500-2000.csv path = "sp500-2000.csv" -description = """S&P 500 index values from 2000 to 2020, retrieved from [Yahoo Finance](https://finance.yahoo.com/quote/%5EDJI/history/).""" +description = """S&P 500 index values from 2000 to 2020.""" [[resources.sources]] title = "Yahoo Finance" @@ -508,7 +572,7 @@ path = "udistrict.json" [[resources]] # Path: unemployment-across-industries.json path = "unemployment-across-industries.json" -description = """Industry-level unemployment statistics from the [Current Population Survey](https://www.census.gov/programs-surveys/cps.html) (CPS), published monthly by the U.S. Bureau of Labor Statistics. Includes unemployed persons and unemployment rate across 11 private industries, as well as agricultural, government, and self-employed workers. Covers January 2000 through February 2010. Industry classification follows format of CPS [Table A-31](https://www.bls.gov/web/empsit/cpseea31.htm). +description = """Industry-level unemployment statistics from the Current Population Survey (CPS), published monthly by the U.S. Bureau of Labor Statistics. Includes unemployed persons and unemployment rate across 11 private industries, as well as agricultural, government, and self-employed workers. Covers January 2000 through February 2010. Industry classification follows format of CPS Table A-31. Each entry in the JSON file contains: - `series`: Industry name @@ -525,9 +589,6 @@ See full BLS [terms of service](https://www.bls.gov/developers/termsOfService.ht title = "U.S. 
Census Bureau Current Population Survey" path = "https://www.census.gov/programs-surveys/cps.html" [[resources.sources]] -title = "BLS Local Area Unemployment Statistics" -path = "https://www.bls.gov/lau/" -[[resources.sources]] title = "BLS LAUS Data Tools" path = "https://www.bls.gov/lau/data.htm" [[resources.sources]] @@ -536,33 +597,40 @@ path = "https://www.bls.gov/web/empsit/cpseea31.htm" [[resources]] # Path: unemployment.tsv path = "unemployment.tsv" -description = """This dataset contains county-level unemployment rates in the United States, with data generally consistent with levels reported in 2009. The dataset is structured as tab-separated values with two columns: -1. `id`: The combined [state and county FIPS code](https://www.census.gov/library/reference/code-lists/ansi.html) -2. `rate`: The unemployment rate for the county +description = """This dataset contains county-level unemployment rates in the United States, with data generally consistent with levels reported in 2009. The dataset is structured as tab-separated values. The unemployment rate represents the number of unemployed persons as a percentage of the labor force. According to the [Bureau of Labor Statistics (BLS) glossary](https://www.bls.gov/opub/hom/glossary.htm#U): > Unemployed persons (Current Population Survey) [are] persons aged 16 years and older who had no employment during the reference week, were available for work, except for temporary illness, and had made specific efforts to find employment sometime during the 4-week period ending with the reference week. Persons who were waiting to be recalled to a job from which they had been laid off need not have been looking for work to be classified as unemployed. -The labor force includes all persons classified as employed or unemployed in accordance with the BLS definitions. 
+ This dataset is derived from the [Local Area Unemployment Statistics (LAUS)](https://www.bls.gov/lau/) program, a federal-state cooperative effort overseen by the Bureau of Labor Statistics (BLS). The LAUS program produces monthly and annual employment, unemployment, and labor force data for census regions and divisions, states, counties, metropolitan areas, and many cities and towns. + For the most up-to-date LAUS data: 1. **Monthly and Annual Data Downloads**: - Visit the [LAUS Data Tools](https://www.bls.gov/lau/data.htm) page for [monthly](https://www.bls.gov/lau/tables.htm#mcounty) and [annual](https://www.bls.gov/lau/tables.htm#cntyaa) county data. 2. **BLS Public Data API**: -- The BLS provides an [API for developers](https://www.bls.gov/developers/) to access various datasets, including LAUS data. +- The BLS provides an API for developers to access various datasets, including LAUS data. - To use the API for LAUS data, refer to the [LAUS Series ID Formats](https://www.bls.gov/help/hlpforma.htm#LA) to construct your query. -- API documentation and examples are available on the [BLS Developers](https://www.bls.gov/developers/) page. -When using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm), which includes the following guidelines: -1. Cite the date that data were accessed or retrieved. -2. Acknowledge that \"BLS.gov cannot vouch for the data or analyses derived from these data after the data have been retrieved from BLS.gov.\" -3. Do not use the BLS logo without permission. -For detailed methodology and technical information about LAUS estimates, refer to the [BLS Handbook of Methods](https://www.bls.gov/opub/hom/lau/home.htm).""" -[[resources.sources]] -title = "BLS Handbook of Methods" -path = "https://www.bls.gov/opub/hom/lau/home.htm" +- API documentation and examples are available on the BLS Developers page. 
+ +When using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm).""" + +[resources.schema] +[[resources.schema.fields]] +name = "id" +description = "The combined state and county FIPS code" + +[[resources.schema.fields]] +name = "rate" +description = "The unemployment rate for the county" + [[resources.sources]] title = "BLS Developers API" path = "https://www.bls.gov/developers/" +[[resources.sources]] +title = "BLS Handbook of Methods" +path = "https://www.bls.gov/opub/hom/lau/home.htm" + [[resources]] # Path: uniform-2d.json path = "uniform-2d.json" @@ -571,7 +639,7 @@ path = "us-10m.json" [[resources]] # Path: us-employment.csv path = "us-employment.csv" -description = """In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job losses across the United States. The downturn in employment, and the slow recovery in hiring that followed, was tracked each month by the [Current Employment Statistics](https://www.bls.gov/ces/) program at the U.S. Bureau of Labor Statistics. +description = """In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job losses across the United States. The downturn in employment, and the slow recovery in hiring that followed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau of Labor Statistics. This file contains the monthly employment total in a variety of job categories from January 2006 through December 2015. The numbers are seasonally adjusted and reported in thousands. The data were downloaded on Nov. 11, 2018, and reformatted for use in this library. Totals are included for the [22 \"supersectors\"](https://download.bls.gov/pub/time.series/ce/ce.supersector) tracked by the BLS. The \"nonfarm\" total is the category typically used by economists and journalists as a stand-in for the country's employment total. 
@@ -579,16 +647,13 @@ A calculated \"nonfarm_change\" column has been appended with the month-to-month [[resources.sources]] title = "U.S. Bureau of Labor Statistics Current Employment Statistics" path = "https://www.bls.gov/ces/" -[[resources.sources]] -title = "BLS Supersectors" -path = "https://download.bls.gov/pub/time.series/ce/ce.supersector" [[resources]] # Path: us-state-capitals.json path = "us-state-capitals.json" [[resources]] # Path: volcano.json path = "volcano.json" -description = """Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. This data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a topographic map by Ross Ihaka, adapted from [R datasets](https://stat.ethz.ch/R-manual/R-patched/library/datasets/html/volcano.html). These data should not be regarded as accurate.""" +description = """Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. This data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a topographic map by Ross Ihaka, adapted from R datasets. These data should not be regarded as accurate.""" [[resources.sources]] title = "R Datasets" @@ -596,7 +661,7 @@ path = "https://stat.ethz.ch/R-manual/R-patched/library/datasets/html/volcano.ht [[resources]] # Path: weather.csv path = "weather.csv" -description = """Data from [NOAA](http://www.ncdc.noaa.gov/cdo-web/datatools/findstation). Transformed using `/scripts/weather.py`. We synthesized the categorical \"weather\" field from multiple fields in the original dataset. This data is intended for instructional purposes.""" +description = """NOAA data transformed using `/scripts/weather.py`. Categorical \"weather\" field synthesized from multiple fields in the original dataset. 
This data is intended for instructional purposes.""" [[resources.sources]] title = "NOAA Climate Data Online" @@ -608,7 +673,7 @@ description = """Instructional dataset showing actual and predicted temperature [[resources]] # Path: wheat.json path = "wheat.json" -description = """In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair), a Scottish engineer who is often credited as the founder of statistical graphics, published [an elegant chart on the price of wheat](http://dh101.humanities.ucla.edu/wp-content/uploads/2014/08/Vis_2.jpg). It plots 250 years of prices alongside weekly wages and the reigning monarch. He intended to demonstrate that “never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time.”""" +description = """In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair), a Scottish engineer who is often credited as the founder of statistical graphics, published an elegant chart on the price of wheat. It plots 250 years of prices alongside weekly wages and the reigning monarch. 
He intended to demonstrate that “never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time.”""" [[resources.sources]] title = "1822 Playfair Chart" From 5ccfae39f75187ac29c8a124104a55aa5f30c333 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 10 Dec 2024 14:58:27 +0000 Subject: [PATCH 03/40] fix: Remove empty `[[resources]]` --- SOURCES.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/SOURCES.toml b/SOURCES.toml index fe8d3be7..9f7866d8 100644 --- a/SOURCES.toml +++ b/SOURCES.toml @@ -256,8 +256,7 @@ title = "Creative Commons Attribution 4.0 International" path = "https://www.gapminder.org/free-material/" -[[resources]] # Path: gapminder.json -[[resources]] +[[resources]] # Path: gapminder.json path = "gapminder.json" description = """This dataset combines key demographic indicators (life expectancy at birth, population, and fertility rate measured as babies per woman) for various countries from 1955 to 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable grouping countries. Gapminder's data documentation notes that its philosophy is to fill data gaps with estimates and use current geographic boundaries for historical data. Gapminder states that it aims to "show people the big picture" rather than support detailed numeric analysis. 
From a2b3be0e49988a7a9411ddb93d7194d42b449f9c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 10 Dec 2024 15:07:37 +0000 Subject: [PATCH 04/40] style: run default `taplo fmt ...` Uses single space before comments --- SOURCES.toml | 144 +++++++++++++++++++++++++-------------------------- 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/SOURCES.toml b/SOURCES.toml index 9f7866d8..2a7e41be 100644 --- a/SOURCES.toml +++ b/SOURCES.toml @@ -1,11 +1,11 @@ -[[resources]] # Path: 7zip.png +[[resources]] # Path: 7zip.png path = "7zip.png" description = """Application icons from open-source software projects.""" -[[resources]] # Path: airports.csv +[[resources]] # Path: airports.csv path = "airports.csv" -[[resources]] # Path: annual-precip.json +[[resources]] # Path: annual-precip.json path = "annual-precip.json" description = """A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell.""" @@ -13,11 +13,11 @@ description = """A raster grid of global annual precipitation for the year 2016 title = "Climate Forecast System Version 2" path = "https://www.ncdc.noaa.gov/data-access/model-data/model-datasets/climate-forecast-system-version2-cfsv2" -[[resources]] # Path: anscombe.json +[[resources]] # Path: anscombe.json path = "anscombe.json" description = """Graphs in Statistical Analysis, F. J. Anscombe, The American Statistician.""" -[[resources]] # Path: barley.json +[[resources]] # Path: barley.json path = "barley.json" description = """The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites. It was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper \"Statistical Determination of Barley Varietal Adaption.\" R.A. 
Fisher's popularized its use in the field of statistics when he included it in his book \"The Design of Experiments.\" Since then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s.""" @@ -28,7 +28,7 @@ path = "https://en.wikipedia.org/wiki/The_Design_of_Experiments" title = "Trellis Charts Paper" path = "http://ml.stat.purdue.edu/stat695t/writings/TrellisDesignControl.pdf" -[[resources]] # Path: birdstrikes.csv +[[resources]] # Path: birdstrikes.csv path = "birdstrikes.csv" description = "Records of reported wildlife strikes received by the U.S. FAA" @@ -36,7 +36,7 @@ description = "Records of reported wildlife strikes received by the U.S. FAA" title = "FAA Wildlife Strike Database" path = "http://wildlife.faa.gov" -[[resources]] # Path: budget.json +[[resources]] # Path: budget.json path = "budget.json" description = """Historical and forecasted federal revenue/receipts produced in 2016 by the U.S. Office of Management and Budget.""" @@ -44,10 +44,10 @@ description = """Historical and forecasted federal revenue/receipts produced in title = "Office of Management and Budget - Budget FY 2016 - Receipts" path = "https://www.govinfo.gov/app/details/BUDGET-2016-DB/BUDGET-2016-DB-3" -[[resources]] # Path: budgets.json +[[resources]] # Path: budgets.json path = "budgets.json" -[[resources]] # Path: burtin.json +[[resources]] # Path: burtin.json path = "burtin.json" description = """The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine. The dataset compares the performance of three antibiotics against 16 different bacteria. The numerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness. 
The dataset was featured as an example in the Protovis project, a precursor to D3.js. The Protovis example notes that, \"Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 μg/ml, and an exaggeration of some values for penicillin.\" The vega-datsets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together. The caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) reads as follows: @@ -63,7 +63,7 @@ path = "https://graphicdesignarchives.org/projects/scope-magazine-vol-iii-5/" title = "Protovis Antibiotics Example" path = "https://mbostock.github.io/protovis/ex/antibiotics-burtin.html" -[[resources]] # Path: cars.json +[[resources]] # Path: cars.json path = "cars.json" description = "Collection of car specifications and performance metrics from various automobile manufacturers." @@ -71,7 +71,7 @@ description = "Collection of car specifications and performance metrics from var title = "StatLib Datasets Archive" path = "http://lib.stat.cmu.edu/datasets/" -[[resources]] # Path: co2-concentration.csv +[[resources]] # Path: co2-concentration.csv path = "co2-concentration.csv" description = """Scripps CO2 program data ut modified to only include date, CO2, seasonally adjusted CO2. 
Only includes rows with valid data.""" @@ -79,7 +79,7 @@ description = """Scripps CO2 program data ut modified to only include date, CO2, title = "Scripps CO2 Program" path = "https://scrippsco2.ucsd.edu/data/atmospheric_co2/primary_mlo_co2_record" -[[resources]] # Path: countries.json +[[resources]] # Path: countries.json path = "countries.json" description = """This dataset combines key demographic indicators (life expectancy at birth and fertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year intervals. It includes both current values and adjacent time period values (previous and next) for each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) notes that its philosophy is to fill data gaps with estimates and use current geographic boundaries for historical data. Gapminder states that it aims to \"show people the big picture\" rather than support detailed numeric analysis.""" @@ -131,10 +131,10 @@ title = "Creative Commons Attribution 4.0 International" path = "https://www.gapminder.org/free-material/" -[[resources]] # Path: crimea.json +[[resources]] # Path: crimea.json path = "crimea.json" -[[resources]] # Path: disasters.csv +[[resources]] # Path: disasters.csv path = "disasters.csv" description = "Annual number of deaths from disasters." @@ -142,14 +142,14 @@ description = "Annual number of deaths from disasters." 
title = "Our World in Data - Natural Catastrophes" path = "https://ourworldindata.org/natural-catastrophes" -[[resources]] # Path: driving.json +[[resources]] # Path: driving.json path = "driving.json" [[resources.sources]] title = "New York Times" path = "https://archive.nytimes.com/www.nytimes.com/imagepages/2010/05/02/business/02metrics.html" -[[resources]] # Path: earthquakes.json +[[resources]] # Path: earthquakes.json path = "earthquakes.json" description = """Earthquake data retrieved Feb 6, 2018""" @@ -157,17 +157,17 @@ description = """Earthquake data retrieved Feb 6, 2018""" title = "USGS Earthquake Feed" path = "https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson" -[[resources]] # Path: ffox.png +[[resources]] # Path: ffox.png path = "ffox.png" description = """Application icons from open-source software projects.""" -[[resources]] # Path: flare-dependencies.json +[[resources]] # Path: flare-dependencies.json path = "flare-dependencies.json" -[[resources]] # Path: flare.json +[[resources]] # Path: flare.json path = "flare.json" -[[resources]] # Path: flights-10k.json +[[resources]] # Path: flights-10k.json path = "flights-10k.json" description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" @@ -175,7 +175,7 @@ description = """Flight delay statistics from U.S. Bureau of Transportation Stat title = "U.S. Bureau of Transportation Statistics" path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" -[[resources]] # Path: flights-200k.arrow +[[resources]] # Path: flights-200k.arrow path = "flights-200k.arrow" description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" @@ -183,7 +183,7 @@ description = """Flight delay statistics from U.S. Bureau of Transportation Stat title = "U.S. 
Bureau of Transportation Statistics" path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" -[[resources]] # Path: flights-200k.json +[[resources]] # Path: flights-200k.json path = "flights-200k.json" description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" @@ -191,7 +191,7 @@ description = """Flight delay statistics from U.S. Bureau of Transportation Stat title = "U.S. Bureau of Transportation Statistics" path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" -[[resources]] # Path: flights-20k.json +[[resources]] # Path: flights-20k.json path = "flights-20k.json" description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" @@ -199,7 +199,7 @@ description = """Flight delay statistics from U.S. Bureau of Transportation Stat title = "U.S. Bureau of Transportation Statistics" path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" -[[resources]] # Path: flights-2k.json +[[resources]] # Path: flights-2k.json path = "flights-2k.json" description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" @@ -207,7 +207,7 @@ description = """Flight delay statistics from U.S. Bureau of Transportation Stat title = "U.S. Bureau of Transportation Statistics" path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" -[[resources]] # Path: flights-3m.parquet +[[resources]] # Path: flights-3m.parquet path = "flights-3m.parquet" description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" @@ -215,7 +215,7 @@ description = """Flight delay statistics from U.S. Bureau of Transportation Stat title = "U.S. 
Bureau of Transportation Statistics" path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" -[[resources]] # Path: flights-5k.json +[[resources]] # Path: flights-5k.json path = "flights-5k.json" description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" @@ -223,7 +223,7 @@ description = """Flight delay statistics from U.S. Bureau of Transportation Stat title = "U.S. Bureau of Transportation Statistics" path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" -[[resources]] # Path: flights-airport.csv +[[resources]] # Path: flights-airport.csv path = "flights-airport.csv" description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" @@ -231,7 +231,7 @@ description = """Flight delay statistics from U.S. Bureau of Transportation Stat title = "U.S. Bureau of Transportation Statistics" path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" -[[resources]] # Path: football.json +[[resources]] # Path: football.json path = "football.json" description = """Football match outcomes across multiple divisions from 2013 to 2017, part of a larger dataset from OpenFootball. The subset was made such that there are records for all five chosen divisions over the time period.""" @@ -239,7 +239,7 @@ description = """Football match outcomes across multiple divisions from 2013 to title = "OpenFootball" path = "https://github.com/openfootball/football.json" -[[resources]] # Path: gapminder-health-income.csv +[[resources]] # Path: gapminder-health-income.csv path = "gapminder-health-income.csv" description = """Per-capita income, life expectancy, population and regional grouping. Dataset does not specify the reference year for the data. Gapminder historical data is subject to revisions. 
@@ -331,15 +331,15 @@ title = "Gapminder Data Documentation" path = "https://www.gapminder.org/data/documentation/" -[[resources]] # Path: gimp.png +[[resources]] # Path: gimp.png path = "gimp.png" description = """Application icons from open-source software projects.""" -[[resources]] # Path: github.csv +[[resources]] # Path: github.csv path = "github.csv" description = """Generated using `/scripts/github.py`.""" -[[resources]] # Path: global-temp.csv +[[resources]] # Path: global-temp.csv path = "global-temp.csv" description = """Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, L-OTI), 1880-2023.""" @@ -347,10 +347,10 @@ description = """Combined Land-Surface Air and Sea-Surface Water Temperature Ano title = "NASA Goddard Institute for Space Studies" path = "https://data.giss.nasa.gov/gistemp/" -[[resources]] # Path: income.json +[[resources]] # Path: income.json path = "income.json" -[[resources]] # Path: iowa-electricity.csv +[[resources]] # Path: iowa-electricity.csv path = "iowa-electricity.csv" description = """The state of Iowa has dramatically increased its production of renewable wind power in recent years. This file contains the annual net generation of electricity in the state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. It is useful for illustrating stacked area charts.""" @@ -358,7 +358,7 @@ description = """The state of Iowa has dramatically increased its production of title = "U.S. 
Energy Information Administration" path = "https://www.eia.gov/beta/electricity/data/browser/#/topic/0?agg=2,0,1&fuel=vvg&geo=00000g&sec=g&linechart=ELEC.GEN.OTH-IA-99.A~ELEC.GEN.COW-IA-99.A~ELEC.GEN.PEL-IA-99.A~ELEC.GEN.PC-IA-99.A~ELEC.GEN.NG-IA-99.A~~ELEC.GEN.NUC-IA-99.A~ELEC.GEN.HYC-IA-99.A~ELEC.GEN.AOR-IA-99.A~ELEC.GEN.HPS-IA-99.A~&columnchart=ELEC.GEN.ALL-IA-99.A&map=ELEC.GEN.ALL-IA-99.A&freq=A&start=2001&end=2017&ctype=linechart<ype=pin&tab=overview&maptype=0&rse=0&pin=" -[[resources]] # Path: jobs.json +[[resources]] # Path: jobs.json path = "jobs.json" description = """U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census microdata\" from as early as 1790. Originally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Viégas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). The dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/). @@ -384,7 +384,7 @@ title = "IPUMS USA" path = "https://usa.ipums.org/usa/" version = "6.0" -[[resources]] # Path: la-riots.csv +[[resources]] # Path: la-riots.csv path = "la-riots.csv" description = """More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles for five days starting on April 29, 1992. This file contains metadata about each person, including the geographic coordinates of their death. 
Compiled and published by the Los Angeles Times Data Desk.""" @@ -392,7 +392,7 @@ description = """More than 60 people lost their lives amid the looting and fires title = "LA Riots Deaths, Los Angeles Times Data Desk" path = "http://spreadsheets.latimes.com/la-riots-deaths/" -[[resources]] # Path: londonBoroughs.json +[[resources]] # Path: londonBoroughs.json path = "londonBoroughs.json" description = """Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. Original data \"contains National Statistics data © Crown copyright and database right (2015)\" and \"Contains Ordnance Survey data © Crown copyright and database right [2015].""" @@ -400,11 +400,11 @@ description = """Boundaries of London boroughs reprojected and simplified from ` title = "Statistical GIS Boundary Files, London Datastore" path = "https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london" -[[resources]] # Path: londonCentroids.json +[[resources]] # Path: londonCentroids.json path = "londonCentroids.json" description = """Calculated from `londongBoroughs.json` using `d3.geoCentroid`.""" -[[resources]] # Path: londonTubeLines.json +[[resources]] # Path: londonTubeLines.json path = "londonTubeLines.json" description = """Selected rail lines simplified from source.""" @@ -412,16 +412,16 @@ description = """Selected rail lines simplified from source.""" title = "London Tube Data" path = "https://github.com/oobrien/vis/tree/master/tube/data" -[[resources]] # Path: lookup_groups.csv +[[resources]] # Path: lookup_groups.csv path = "lookup_groups.csv" -[[resources]] # Path: lookup_people.csv +[[resources]] # Path: lookup_people.csv path = "lookup_people.csv" -[[resources]] # Path: miserables.json +[[resources]] # Path: miserables.json path = "miserables.json" -[[resources]] # Path: monarchs.json +[[resources]] # Path: monarchs.json path = "monarchs.json" description = """A chronological list of English and British monarchs from Elizabeth I 
through George IV. Each entry includes: @@ -449,17 +449,17 @@ title = "The Royal Family - Interregnum" path = "https://www.royal.uk/interregnum-1649-1660" -[[resources]] # Path: movies.json +[[resources]] # Path: movies.json path = "movies.json" description = """The dataset has well known and intentionally included errors. This dataset is used for instructional purposes, including the need to reckon with dirty data.""" -[[resources]] # Path: normal-2d.json +[[resources]] # Path: normal-2d.json path = "normal-2d.json" -[[resources]] # Path: obesity.json +[[resources]] # Path: obesity.json path = "obesity.json" -[[resources]] # Path: ohlc.json +[[resources]] # Path: ohlc.json path = "ohlc.json" description = """This dataset contains the performance of the Chicago Board Options Exchange [Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/%5EVIX?ltr=1#eyJpbnRlcnZhbCI6ImRheSIsInBlcmlvZGljaXR5IjoxLCJ0aW1lVW5pdCI6bnVsbCwiY2FuZGxlV2lkdGgiOjgsInZvbHVtZVVuZGVybGF5Ijp0cnVlLCJhZGoiOnRydWUsImNyb3NzaGFpciI6dHJ1ZSwiY2hhcnRUeXBlIjoibGluZSIsImV4dGVuZGVkIjpmYWxzZSwibWFya2V0U2Vzc2lvbnMiOnt9LCJhZ2dyZWdhdGlvblR5cGUiOiJvaGxjIiwiY2hhcnRTY2FsZSI6ImxpbmVhciIsInN0dWRpZXMiOnsidm9sIHVuZHIiOnsidHlwZSI6InZvbCB1bmRyIiwiaW5wdXRzIjp7ImlkIjoidm9sIHVuZHIiLCJkaXNwbGF5Ijoidm9sIHVuZHIifSwib3V0cHV0cyI6eyJVcCBWb2x1bWUiOiIjMDBiMDYxIiwiRG93biBWb2x1bWUiOiIjRkYzMzNBIn0sInBhbmVsIjoiY2hhcnQiLCJwYXJhbWV0ZXJzIjp7IndpZHRoRmFjdG9yIjowLjQ1LCJjaGFydE5hbWUiOiJjaGFydCJ9fX0sInBhbmVscyI6eyJjaGFydCI6eyJwZXJjZW50IjoxLCJkaXNwbGF5IjoiXlZJWCIsImNoYXJ0TmFtZSI6ImNoYXJ0IiwidG9wIjowfX0sInNldFNwYW4iOnt9LCJsaW5lV2lkdGgiOjIsInN0cmlwZWRCYWNrZ3JvdWQiOnRydWUsImV2ZW50cyI6dHJ1ZSwiY29sb3IiOiIjMDA4MWYyIiwiZXZlbnRNYXAiOnsiY29ycG9yYXRlIjp7ImRpdnMiOnRydWUsInNwbGl0cyI6dHJ1ZX0sInNpZ0RldiI6e319LCJzeW1ib2xzIjpbeyJzeW1ib2wiOiJeVklYIiwic3ltYm9sT2JqZWN0Ijp7InN5bWJvbCI6Il5WSVgifSwicGVyaW9kaWNpdHkiOjEsImludGVydmFsIjoiZGF5IiwidGltZVVuaXQiOm51bGwsInNldFNwYW4iOnt9fV19)) in the summer of 2009.""" @@ -467,7 +467,7 
@@ description = """This dataset contains the performance of the Chicago Board Opti title = "Yahoo Finance VIX Data" path = "https://finance.yahoo.com/chart/%5EVIX" -[[resources]] # Path: penguins.json +[[resources]] # Path: penguins.json path = "penguins.json" description = """Palmer Archipelago (Antarctica) penguin data collected and made available by [Dr. Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) and the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research Network](https://lternet.edu/).""" @@ -478,7 +478,7 @@ path = "https://pal.lternet.edu/" title = "Allison Horst's Penguins Repository" path = "https://github.com/allisonhorst/penguins" -[[resources]] # Path: platformer-terrain.json +[[resources]] # Path: platformer-terrain.json path = "platformer-terrain.json" description = """Assets from the video game Celeste.""" @@ -486,10 +486,10 @@ description = """Assets from the video game Celeste.""" title = "Celeste Game" path = "http://www.celestegame.com/" -[[resources]] # Path: points.json +[[resources]] # Path: points.json path = "points.json" -[[resources]] # Path: political-contributions.json +[[resources]] # Path: political-contributions.json path = "political-contributions.json" description = """Summary financial information on contributions to candidates for U.S. elections. An updated version of this datset is available from the \"all candidates\" files (in pipe-delimited format) on the bulk data download page of the U.S. Federal Election Commission, or, alternatively, via OpenFEC. Information on each of the 25 columns is available from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/). The sample dataset in `political-contributions.json` contains 58 records with dates from 2015. 
FEC data is subject to the commission's: @@ -506,7 +506,7 @@ path = "https://www.fec.gov/data/browse-data/?tab=bulk-data" title = "OpenFEC API" path = "https://api.open.fec.gov/developers/" -[[resources]] # Path: population.json +[[resources]] # Path: population.json path = "population.json" description = """United States population statistics by sex and age group across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census microdata\" from as early as 1790. The dataset is structured as follows: @@ -522,7 +522,7 @@ Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. title = "IPUMS USA" path = "https://usa.ipums.org/usa/" -[[resources]] # Path: population_engineers_hurricanes.csv +[[resources]] # Path: population_engineers_hurricanes.csv path = "population_engineers_hurricanes.csv" description = """Per-state data on population, number of engineers, and hurricanes. Used in Vega-Lite example, [Three Choropleths Representing Disjoint DAta from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html)""" @@ -536,7 +536,7 @@ path = "https://factfinder.census.gov/faces/tableservices/jsf/pages/productview. title = "NOAA National Climatic Data Center" path = "https://www.ncdc.noaa.gov/cdo-web/datatools/records" -[[resources]] # Path: seattle-weather-hourly-normals.csv +[[resources]] # Path: seattle-weather-hourly-normals.csv path = "seattle-weather-hourly-normals.csv" description = """Hourly weather normals with metric units. The 1981-2010 Climate Normals are NCDC's three-decade averages of climatological variables, including temperature and precipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf). We only included temperature, wind, and pressure and updated the format to be easier to parse.""" @@ -544,7 +544,7 @@ description = """Hourly weather normals with metric units. 
The 1981-2010 Climate title = "NOAA National Climatic Data Center (NCDC)" path = "https://www.ncdc.noaa.gov/cdo-web/datatools/normals" -[[resources]] # Path: seattle-weather.csv +[[resources]] # Path: seattle-weather.csv path = "seattle-weather.csv" description = """Daily weather records with metric units. Transformed using `/scripts/weather.py`. The categorical \"weather\" field is synthesized from multiple fields in the original dataset. This data is intended for instructional purposes.""" @@ -552,7 +552,7 @@ description = """Daily weather records with metric units. Transformed using `/sc title = "NOAA National Climatic Data Center" path = "https://www.ncdc.noaa.gov/cdo-web/datatools/records" -[[resources]] # Path: sp500-2000.csv +[[resources]] # Path: sp500-2000.csv path = "sp500-2000.csv" description = """S&P 500 index values from 2000 to 2020.""" @@ -560,16 +560,16 @@ description = """S&P 500 index values from 2000 to 2020.""" title = "Yahoo Finance" path = "https://finance.yahoo.com/quote/%5EDJI/history/" -[[resources]] # Path: sp500.csv +[[resources]] # Path: sp500.csv path = "sp500.csv" -[[resources]] # Path: stocks.csv +[[resources]] # Path: stocks.csv path = "stocks.csv" -[[resources]] # Path: udistrict.json +[[resources]] # Path: udistrict.json path = "udistrict.json" -[[resources]] # Path: unemployment-across-industries.json +[[resources]] # Path: unemployment-across-industries.json path = "unemployment-across-industries.json" description = """Industry-level unemployment statistics from the Current Population Survey (CPS), published monthly by the U.S. Bureau of Labor Statistics. Includes unemployed persons and unemployment rate across 11 private industries, as well as agricultural, government, and self-employed workers. Covers January 2000 through February 2010. Industry classification follows format of CPS Table A-31. 
Each entry in the JSON file contains: @@ -594,7 +594,7 @@ path = "https://www.bls.gov/lau/data.htm" title = "Bureau of Labor Statistics Table A-31" path = "https://www.bls.gov/web/empsit/cpseea31.htm" -[[resources]] # Path: unemployment.tsv +[[resources]] # Path: unemployment.tsv path = "unemployment.tsv" description = """This dataset contains county-level unemployment rates in the United States, with data generally consistent with levels reported in 2009. The dataset is structured as tab-separated values. @@ -630,13 +630,13 @@ path = "https://www.bls.gov/developers/" title = "BLS Handbook of Methods" path = "https://www.bls.gov/opub/hom/lau/home.htm" -[[resources]] # Path: uniform-2d.json +[[resources]] # Path: uniform-2d.json path = "uniform-2d.json" -[[resources]] # Path: us-10m.json +[[resources]] # Path: us-10m.json path = "us-10m.json" -[[resources]] # Path: us-employment.csv +[[resources]] # Path: us-employment.csv path = "us-employment.csv" description = """In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job losses across the United States. The downturn in employment, and the slow recovery in hiring that followed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau of Labor Statistics. This file contains the monthly employment total in a variety of job categories from January 2006 through December 2015. The numbers are seasonally adjusted and reported in thousands. The data were downloaded on Nov. 11, 2018, and reformatted for use in this library. @@ -647,10 +647,10 @@ A calculated \"nonfarm_change\" column has been appended with the month-to-month title = "U.S. 
Bureau of Labor Statistics Current Employment Statistics" path = "https://www.bls.gov/ces/" -[[resources]] # Path: us-state-capitals.json +[[resources]] # Path: us-state-capitals.json path = "us-state-capitals.json" -[[resources]] # Path: volcano.json +[[resources]] # Path: volcano.json path = "volcano.json" description = """Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. This data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a topographic map by Ross Ihaka, adapted from R datasets. These data should not be regarded as accurate.""" @@ -658,7 +658,7 @@ description = """Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckla title = "R Datasets" path = "https://stat.ethz.ch/R-manual/R-patched/library/datasets/html/volcano.html" -[[resources]] # Path: weather.csv +[[resources]] # Path: weather.csv path = "weather.csv" description = """NOAA data transformed using `/scripts/weather.py`. Categorical \"weather\" field synthesized from multiple fields in the original dataset. This data is intended for instructional purposes.""" @@ -666,11 +666,11 @@ description = """NOAA data transformed using `/scripts/weather.py`. Categorical title = "NOAA Climate Data Online" path = "http://www.ncdc.noaa.gov/cdo-web/datatools/findstation" -[[resources]] # Path: weather.json +[[resources]] # Path: weather.json path = "weather.json" description = """Instructional dataset showing actual and predicted temperature data.""" -[[resources]] # Path: wheat.json +[[resources]] # Path: wheat.json path = "wheat.json" description = """In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair), a Scottish engineer who is often credited as the founder of statistical graphics, published an elegant chart on the price of wheat. It plots 250 years of prices alongside weekly wages and the reigning monarch. 
He intended to demonstrate that “never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time.”""" @@ -678,14 +678,14 @@ description = """In an 1822 letter to Parliament, [William Playfair](https://en. title = "1822 Playfair Chart" path = "http://dh101.humanities.ucla.edu/wp-content/uploads/2014/08/Vis_2.jpg" -[[resources]] # Path: windvectors.csv +[[resources]] # Path: windvectors.csv path = "windvectors.csv" description = """Simulated wind patterns over northwestern Europe.""" -[[resources]] # Path: world-110m.json +[[resources]] # Path: world-110m.json path = "world-110m.json" -[[resources]] # Path: zipcodes.csv +[[resources]] # Path: zipcodes.csv path = "zipcodes.csv" description = """GeoNames.org""" From 1ea28125fc3e0a0e3973edef04f92866b4b27ed6 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 10 Dec 2024 15:29:21 +0000 Subject: [PATCH 05/40] style: run `taplo fmt -o "align_entries=true" -o "allowed_blank_lines=1"` --- SOURCES.toml | 270 +++++++++++++++++++++++++-------------------------- 1 file changed, 133 insertions(+), 137 deletions(-) diff --git a/SOURCES.toml b/SOURCES.toml index 2a7e41be..eca7bab0 100644 --- a/SOURCES.toml +++ b/SOURCES.toml @@ -1,48 +1,48 @@ [[resources]] # Path: 7zip.png -path = "7zip.png" +path = "7zip.png" description = """Application icons from open-source software projects.""" [[resources]] # Path: airports.csv path = "airports.csv" [[resources]] # Path: annual-precip.json -path = "annual-precip.json" +path = "annual-precip.json" description = """A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell.""" [[resources.sources]] title = "Climate Forecast System Version 2" -path = "https://www.ncdc.noaa.gov/data-access/model-data/model-datasets/climate-forecast-system-version2-cfsv2" +path = 
"https://www.ncdc.noaa.gov/data-access/model-data/model-datasets/climate-forecast-system-version2-cfsv2" [[resources]] # Path: anscombe.json -path = "anscombe.json" +path = "anscombe.json" description = """Graphs in Statistical Analysis, F. J. Anscombe, The American Statistician.""" [[resources]] # Path: barley.json -path = "barley.json" +path = "barley.json" description = """The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites. It was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper \"Statistical Determination of Barley Varietal Adaption.\" R.A. Fisher's popularized its use in the field of statistics when he included it in his book \"The Design of Experiments.\" Since then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s.""" [[resources.sources]] title = "The Design of Experiments Reference" -path = "https://en.wikipedia.org/wiki/The_Design_of_Experiments" +path = "https://en.wikipedia.org/wiki/The_Design_of_Experiments" [[resources.sources]] title = "Trellis Charts Paper" -path = "http://ml.stat.purdue.edu/stat695t/writings/TrellisDesignControl.pdf" +path = "http://ml.stat.purdue.edu/stat695t/writings/TrellisDesignControl.pdf" [[resources]] # Path: birdstrikes.csv -path = "birdstrikes.csv" +path = "birdstrikes.csv" description = "Records of reported wildlife strikes received by the U.S. FAA" [[resources.sources]] title = "FAA Wildlife Strike Database" -path = "http://wildlife.faa.gov" +path = "http://wildlife.faa.gov" [[resources]] # Path: budget.json -path = "budget.json" +path = "budget.json" description = """Historical and forecasted federal revenue/receipts produced in 2016 by the U.S. 
Office of Management and Budget.""" [[resources.sources]] title = "Office of Management and Budget - Budget FY 2016 - Receipts" -path = "https://www.govinfo.gov/app/details/BUDGET-2016-DB/BUDGET-2016-DB-3" +path = "https://www.govinfo.gov/app/details/BUDGET-2016-DB/BUDGET-2016-DB-3" [[resources]] # Path: budgets.json path = "budgets.json" @@ -58,107 +58,106 @@ The caption of the original 1951 [visualization](https://graphicdesignarchives.o [[resources.sources]] title = "Scope Magazine" -path = "https://graphicdesignarchives.org/projects/scope-magazine-vol-iii-5/" +path = "https://graphicdesignarchives.org/projects/scope-magazine-vol-iii-5/" [[resources.sources]] title = "Protovis Antibiotics Example" -path = "https://mbostock.github.io/protovis/ex/antibiotics-burtin.html" +path = "https://mbostock.github.io/protovis/ex/antibiotics-burtin.html" [[resources]] # Path: cars.json -path = "cars.json" +path = "cars.json" description = "Collection of car specifications and performance metrics from various automobile manufacturers." [[resources.sources]] title = "StatLib Datasets Archive" -path = "http://lib.stat.cmu.edu/datasets/" +path = "http://lib.stat.cmu.edu/datasets/" [[resources]] # Path: co2-concentration.csv -path = "co2-concentration.csv" +path = "co2-concentration.csv" description = """Scripps CO2 program data ut modified to only include date, CO2, seasonally adjusted CO2. Only includes rows with valid data.""" [[resources.sources]] title = "Scripps CO2 Program" -path = "https://scrippsco2.ucsd.edu/data/atmospheric_co2/primary_mlo_co2_record" +path = "https://scrippsco2.ucsd.edu/data/atmospheric_co2/primary_mlo_co2_record" [[resources]] # Path: countries.json -path = "countries.json" +path = "countries.json" description = """This dataset combines key demographic indicators (life expectancy at birth and fertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year intervals. 
It includes both current values and adjacent time period values (previous and next) for each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) notes that its philosophy is to fill data gaps with estimates and use current geographic boundaries for historical data. Gapminder states that it aims to \"show people the big picture\" rather than support detailed numeric analysis.""" [resources.schema] [[resources.schema.fields]] -name = "year" +name = "year" description = "Years from 1955 to 2000 at 5-year intervals" [[resources.schema.fields]] -name = "country" +name = "country" description = "Name of the country" [[resources.schema.fields]] -name = "fertility" +name = "fertility" description = "Fertility rate (average number of children per woman) for the given year" [[resources.schema.fields]] -name = "life_expect" +name = "life_expect" description = "Life expectancy in years for the given year" [[resources.schema.fields]] -name = "p_fertility" +name = "p_fertility" description = "Fertility rate for the previous 5-year interval" [[resources.schema.fields]] -name = "n_fertility" +name = "n_fertility" description = "Fertility rate for the next 5-year interval" [[resources.schema.fields]] -name = "p_life_expect" +name = "p_life_expect" description = "Life expectancy for the previous 5-year interval" [[resources.schema.fields]] -name = "n_life_expect" +name = "n_life_expect" description = "Life expectancy for the next 5-year interval" [[resources.sources]] -title = "Gapminder Foundation - Life Expectancy" -path = "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676" +title = "Gapminder Foundation - Life Expectancy" +path = "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676" version = "14" [[resources.sources]] -title = "Gapminder Foundation - Fertility" -path = 
"https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676" +title = "Gapminder Foundation - Fertility" +path = "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676" version = "14" [[resources.licenses]] title = "Creative Commons Attribution 4.0 International" -path = "https://www.gapminder.org/free-material/" - +path = "https://www.gapminder.org/free-material/" [[resources]] # Path: crimea.json path = "crimea.json" [[resources]] # Path: disasters.csv -path = "disasters.csv" +path = "disasters.csv" description = "Annual number of deaths from disasters." [[resources.sources]] title = "Our World in Data - Natural Catastrophes" -path = "https://ourworldindata.org/natural-catastrophes" +path = "https://ourworldindata.org/natural-catastrophes" [[resources]] # Path: driving.json path = "driving.json" [[resources.sources]] title = "New York Times" -path = "https://archive.nytimes.com/www.nytimes.com/imagepages/2010/05/02/business/02metrics.html" +path = "https://archive.nytimes.com/www.nytimes.com/imagepages/2010/05/02/business/02metrics.html" [[resources]] # Path: earthquakes.json -path = "earthquakes.json" +path = "earthquakes.json" description = """Earthquake data retrieved Feb 6, 2018""" [[resources.sources]] title = "USGS Earthquake Feed" -path = "https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson" +path = "https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson" [[resources]] # Path: ffox.png -path = "ffox.png" +path = "ffox.png" description = """Application icons from open-source software projects.""" [[resources]] # Path: flare-dependencies.json @@ -168,76 +167,76 @@ path = "flare-dependencies.json" path = "flare.json" [[resources]] # Path: flights-10k.json -path = "flights-10k.json" +path = "flights-10k.json" description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. 
Transformed using `/scripts/flights.py`""" [[resources.sources]] title = "U.S. Bureau of Transportation Statistics" -path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" +path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" [[resources]] # Path: flights-200k.arrow -path = "flights-200k.arrow" +path = "flights-200k.arrow" description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" [[resources.sources]] title = "U.S. Bureau of Transportation Statistics" -path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" +path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" [[resources]] # Path: flights-200k.json -path = "flights-200k.json" +path = "flights-200k.json" description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" [[resources.sources]] title = "U.S. Bureau of Transportation Statistics" -path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" +path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" [[resources]] # Path: flights-20k.json -path = "flights-20k.json" +path = "flights-20k.json" description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" [[resources.sources]] title = "U.S. Bureau of Transportation Statistics" -path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" +path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" [[resources]] # Path: flights-2k.json -path = "flights-2k.json" +path = "flights-2k.json" description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. 
Transformed using `/scripts/flights.py`""" [[resources.sources]] title = "U.S. Bureau of Transportation Statistics" -path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" +path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" [[resources]] # Path: flights-3m.parquet -path = "flights-3m.parquet" +path = "flights-3m.parquet" description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" [[resources.sources]] title = "U.S. Bureau of Transportation Statistics" -path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" +path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" [[resources]] # Path: flights-5k.json -path = "flights-5k.json" +path = "flights-5k.json" description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" [[resources.sources]] title = "U.S. Bureau of Transportation Statistics" -path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" +path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" [[resources]] # Path: flights-airport.csv -path = "flights-airport.csv" +path = "flights-airport.csv" description = """Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`""" [[resources.sources]] title = "U.S. Bureau of Transportation Statistics" -path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" +path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" [[resources]] # Path: football.json -path = "football.json" +path = "football.json" description = """Football match outcomes across multiple divisions from 2013 to 2017, part of a larger dataset from OpenFootball. 
The subset was made such that there are records for all five chosen divisions over the time period.""" [[resources.sources]] title = "OpenFootball" -path = "https://github.com/openfootball/football.json" +path = "https://github.com/openfootball/football.json" [[resources]] # Path: gapminder-health-income.csv path = "gapminder-health-income.csv" @@ -247,14 +246,13 @@ Gapminder (v30, 2023) defines per-capita income as follows: >\"This is real GDP per capita (gross domestic product per person adjusted for inflation) converted to international dollars using purchasing power parity rates. An international dollar has the same purchasing power over GDP as the U.S. dollar has in the United States.\"""" [[resources.sources]] title = "Gapminder Foundation" -path = "https://www.gapminder.org" +path = "https://www.gapminder.org" [[resources.sources]] title = "Gapminder GDP Per Capita Data" -path = "https://docs.google.com/spreadsheets/d/1i5AEui3WZNZqh7MQ4AKkJuCz4rRxGR_pw_9gtbcBOqQ/edit?gid=501532268#gid=501532268" +path = "https://docs.google.com/spreadsheets/d/1i5AEui3WZNZqh7MQ4AKkJuCz4rRxGR_pw_9gtbcBOqQ/edit?gid=501532268#gid=501532268" [[resources.licenses]] title = "Creative Commons Attribution 4.0 International" -path = "https://www.gapminder.org/free-material/" - +path = "https://www.gapminder.org/free-material/" [[resources]] # Path: gapminder.json path = "gapminder.json" @@ -267,96 +265,95 @@ Notes: [resources.schema] [[resources.schema.fields]] -name = "year" +name = "year" description = "Years from 1955 to 2005 at 5-year intervals" [[resources.schema.fields]] -name = "country" +name = "country" description = "Name of the country" [[resources.schema.fields]] -name = "cluster" +name = "cluster" description = "A categorical variable (values 0-5) grouping countries by region" [[resources.schema.fields]] -name = "pop" +name = "pop" description = "Population of the country" [[resources.schema.fields]] -name = "life_expect" +name = "life_expect" description = "Life 
expectancy in years" [[resources.schema.fields]] -name = "fertility" +name = "fertility" description = "Fertility rate (average number of children per woman" [[resources.sources]] -title = "Gapminder Foundation - Life Expectancy (Data)" -path = "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676" +title = "Gapminder Foundation - Life Expectancy (Data)" +path = "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676" version = "14" [[resources.sources]] title = "Gapminder Foundatio - Life Expectancy (Documentation)" -path = "https://www.gapminder.org/data/documentation/gd004/" +path = "https://www.gapminder.org/data/documentation/gd004/" [[resources.sources]] -title = "Gapminder Foundation - Population (Data)" -path = "https://docs.google.com/spreadsheets/d/1c1luQNdpH90tNbMIeU7jD__59wQ0bdIGRFpbMm8ZBTk/edit?gid=176703676#gid=176703676" +title = "Gapminder Foundation - Population (Data)" +path = "https://docs.google.com/spreadsheets/d/1c1luQNdpH90tNbMIeU7jD__59wQ0bdIGRFpbMm8ZBTk/edit?gid=176703676#gid=176703676" version = "7" [[resources.sources]] title = "Gapminder Foundation - Population (Documentation)" -path = "https://www.gapminder.org/data/documentation/gd003/" +path = "https://www.gapminder.org/data/documentation/gd003/" [[resources.sources]] -title = "Gapminder Foundation - Fertility (Data)" -path = "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676" +title = "Gapminder Foundation - Fertility (Data)" +path = "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676" version = "14" [[resources.sources]] title = "Gapminder Foundation - Fertility Documentation (Documentation)" -path = "https://www.gapminder.org/data/documentation/gd008/" +path = "https://www.gapminder.org/data/documentation/gd008/" 
[[resources.sources]] -title = "Gapminder Foundation - Data Geographies (Data)" -path = "https://docs.google.com/spreadsheets/d/1qHalit8sXC0R8oVXibc2wa2gY7bkwGzOybEMTWp-08o/edit?gid=1597424158#gid=1597424158" +title = "Gapminder Foundation - Data Geographies (Data)" +path = "https://docs.google.com/spreadsheets/d/1qHalit8sXC0R8oVXibc2wa2gY7bkwGzOybEMTWp-08o/edit?gid=1597424158#gid=1597424158" version = "2" [[resources.sources]] title = "Gapminder Foundation - Data Geographies (Documentation)" -path = "https://www.gapminder.org/data/geo/" +path = "https://www.gapminder.org/data/geo/" [[resources.sources]] title = "Gapminder Data Documentation" -path = "https://www.gapminder.org/data/documentation/" - +path = "https://www.gapminder.org/data/documentation/" [[resources]] # Path: gimp.png -path = "gimp.png" +path = "gimp.png" description = """Application icons from open-source software projects.""" [[resources]] # Path: github.csv -path = "github.csv" +path = "github.csv" description = """Generated using `/scripts/github.py`.""" [[resources]] # Path: global-temp.csv -path = "global-temp.csv" +path = "global-temp.csv" description = """Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, L-OTI), 1880-2023.""" [[resources.sources]] title = "NASA Goddard Institute for Space Studies" -path = "https://data.giss.nasa.gov/gistemp/" +path = "https://data.giss.nasa.gov/gistemp/" [[resources]] # Path: income.json path = "income.json" [[resources]] # Path: iowa-electricity.csv -path = "iowa-electricity.csv" +path = "iowa-electricity.csv" description = """The state of Iowa has dramatically increased its production of renewable wind power in recent years. This file contains the annual net generation of electricity in the state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. It is useful for illustrating stacked area charts.""" [[resources.sources]] title = "U.S. 
Energy Information Administration" -path = "https://www.eia.gov/beta/electricity/data/browser/#/topic/0?agg=2,0,1&fuel=vvg&geo=00000g&sec=g&linechart=ELEC.GEN.OTH-IA-99.A~ELEC.GEN.COW-IA-99.A~ELEC.GEN.PEL-IA-99.A~ELEC.GEN.PC-IA-99.A~ELEC.GEN.NG-IA-99.A~~ELEC.GEN.NUC-IA-99.A~ELEC.GEN.HYC-IA-99.A~ELEC.GEN.AOR-IA-99.A~ELEC.GEN.HPS-IA-99.A~&columnchart=ELEC.GEN.ALL-IA-99.A&map=ELEC.GEN.ALL-IA-99.A&freq=A&start=2001&end=2017&ctype=linechart&ltype=pin&tab=overview&maptype=0&rse=0&pin=" +path = "https://www.eia.gov/beta/electricity/data/browser/#/topic/0?agg=2,0,1&fuel=vvg&geo=00000g&sec=g&linechart=ELEC.GEN.OTH-IA-99.A~ELEC.GEN.COW-IA-99.A~ELEC.GEN.PEL-IA-99.A~ELEC.GEN.PC-IA-99.A~ELEC.GEN.NG-IA-99.A~~ELEC.GEN.NUC-IA-99.A~ELEC.GEN.HYC-IA-99.A~ELEC.GEN.AOR-IA-99.A~ELEC.GEN.HPS-IA-99.A~&columnchart=ELEC.GEN.ALL-IA-99.A&map=ELEC.GEN.ALL-IA-99.A&freq=A&start=2001&end=2017&ctype=linechart&ltype=pin&tab=overview&maptype=0&rse=0&pin=" [[resources]] # Path: jobs.json path = "jobs.json" @@ -380,37 +377,37 @@ When using this dataset, please refer to IPUMS USA [terms of use](https://usa.ip Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0""" [[resources.sources]] -title = "IPUMS USA" -path = "https://usa.ipums.org/usa/" +title = "IPUMS USA" +path = "https://usa.ipums.org/usa/" version = "6.0" [[resources]] # Path: la-riots.csv -path = "la-riots.csv" +path = "la-riots.csv" description = """More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles for five days starting on April 29, 1992. This file contains metadata about each person, including the geographic coordinates of their death.
Compiled and published by the Los Angeles Times Data Desk.""" [[resources.sources]] title = "LA Riots Deaths, Los Angeles Times Data Desk" -path = "http://spreadsheets.latimes.com/la-riots-deaths/" +path = "http://spreadsheets.latimes.com/la-riots-deaths/" [[resources]] # Path: londonBoroughs.json -path = "londonBoroughs.json" +path = "londonBoroughs.json" description = """Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. Original data \"contains National Statistics data © Crown copyright and database right (2015)\" and \"Contains Ordnance Survey data © Crown copyright and database right [2015].""" [[resources.sources]] title = "Statistical GIS Boundary Files, London Datastore" -path = "https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london" +path = "https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london" [[resources]] # Path: londonCentroids.json -path = "londonCentroids.json" +path = "londonCentroids.json" description = """Calculated from `londongBoroughs.json` using `d3.geoCentroid`.""" [[resources]] # Path: londonTubeLines.json -path = "londonTubeLines.json" +path = "londonTubeLines.json" description = """Selected rail lines simplified from source.""" [[resources.sources]] title = "London Tube Data" -path = "https://github.com/oobrien/vis/tree/master/tube/data" +path = "https://github.com/oobrien/vis/tree/master/tube/data" [[resources]] # Path: lookup_groups.csv path = "lookup_groups.csv" @@ -442,15 +439,14 @@ Source data has been verified against the kings & queens and interregnum pages o [[resources.sources]] title = "The Royal Family - Kings & Queens" -path = "https://www.royal.uk/kings-and-queens-1066" +path = "https://www.royal.uk/kings-and-queens-1066" [[resources.sources]] title = "The Royal Family - Interregnum" -path = "https://www.royal.uk/interregnum-1649-1660" - +path = "https://www.royal.uk/interregnum-1649-1660" [[resources]] # Path: movies.json -path = 
"movies.json" +path = "movies.json" description = """The dataset has well known and intentionally included errors. This dataset is used for instructional purposes, including the need to reckon with dirty data.""" [[resources]] # Path: normal-2d.json @@ -460,31 +456,31 @@ path = "normal-2d.json" path = "obesity.json" [[resources]] # Path: ohlc.json -path = "ohlc.json" +path = "ohlc.json" description = """This dataset contains the performance of the Chicago Board Options Exchange [Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/%5EVIX?ltr=1#eyJpbnRlcnZhbCI6ImRheSIsInBlcmlvZGljaXR5IjoxLCJ0aW1lVW5pdCI6bnVsbCwiY2FuZGxlV2lkdGgiOjgsInZvbHVtZVVuZGVybGF5Ijp0cnVlLCJhZGoiOnRydWUsImNyb3NzaGFpciI6dHJ1ZSwiY2hhcnRUeXBlIjoibGluZSIsImV4dGVuZGVkIjpmYWxzZSwibWFya2V0U2Vzc2lvbnMiOnt9LCJhZ2dyZWdhdGlvblR5cGUiOiJvaGxjIiwiY2hhcnRTY2FsZSI6ImxpbmVhciIsInN0dWRpZXMiOnsidm9sIHVuZHIiOnsidHlwZSI6InZvbCB1bmRyIiwiaW5wdXRzIjp7ImlkIjoidm9sIHVuZHIiLCJkaXNwbGF5Ijoidm9sIHVuZHIifSwib3V0cHV0cyI6eyJVcCBWb2x1bWUiOiIjMDBiMDYxIiwiRG93biBWb2x1bWUiOiIjRkYzMzNBIn0sInBhbmVsIjoiY2hhcnQiLCJwYXJhbWV0ZXJzIjp7IndpZHRoRmFjdG9yIjowLjQ1LCJjaGFydE5hbWUiOiJjaGFydCJ9fX0sInBhbmVscyI6eyJjaGFydCI6eyJwZXJjZW50IjoxLCJkaXNwbGF5IjoiXlZJWCIsImNoYXJ0TmFtZSI6ImNoYXJ0IiwidG9wIjowfX0sInNldFNwYW4iOnt9LCJsaW5lV2lkdGgiOjIsInN0cmlwZWRCYWNrZ3JvdWQiOnRydWUsImV2ZW50cyI6dHJ1ZSwiY29sb3IiOiIjMDA4MWYyIiwiZXZlbnRNYXAiOnsiY29ycG9yYXRlIjp7ImRpdnMiOnRydWUsInNwbGl0cyI6dHJ1ZX0sInNpZ0RldiI6e319LCJzeW1ib2xzIjpbeyJzeW1ib2wiOiJeVklYIiwic3ltYm9sT2JqZWN0Ijp7InN5bWJvbCI6Il5WSVgifSwicGVyaW9kaWNpdHkiOjEsImludGVydmFsIjoiZGF5IiwidGltZVVuaXQiOm51bGwsInNldFNwYW4iOnt9fV19)) in the summer of 2009.""" [[resources.sources]] title = "Yahoo Finance VIX Data" -path = "https://finance.yahoo.com/chart/%5EVIX" +path = "https://finance.yahoo.com/chart/%5EVIX" [[resources]] # Path: penguins.json -path = "penguins.json" +path = "penguins.json" description = """Palmer Archipelago (Antarctica) penguin data collected and made available 
by [Dr. Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) and the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research Network](https://lternet.edu/).""" [[resources.sources]] title = "Palmer Station Antarctica LTER" -path = "https://pal.lternet.edu/" +path = "https://pal.lternet.edu/" [[resources.sources]] title = "Allison Horst's Penguins Repository" -path = "https://github.com/allisonhorst/penguins" +path = "https://github.com/allisonhorst/penguins" [[resources]] # Path: platformer-terrain.json -path = "platformer-terrain.json" +path = "platformer-terrain.json" description = """Assets from the video game Celeste.""" [[resources.sources]] title = "Celeste Game" -path = "http://www.celestegame.com/" +path = "http://www.celestegame.com/" [[resources]] # Path: points.json path = "points.json" @@ -501,10 +497,10 @@ Additionally, the FEC's Github [repository](https://github.com/fecgov/FEC) state > This project is in the public domain within the United States, and we waive worldwide copyright and related rights through [CC0 universal public domain](https://creativecommons.org/publicdomain/zero/1.0/) dedication. Read more on our license page. A few restrictions limit the way you can use FEC data. For example, you can't use contributor lists for commercial purposes or to solicit donations. Learn more on [FEC.gov](https://www.fec.gov/).""" [[resources.sources]] title = "Federal Election Commission Bulk Data" -path = "https://www.fec.gov/data/browse-data/?tab=bulk-data" +path = "https://www.fec.gov/data/browse-data/?tab=bulk-data" [[resources.sources]] title = "OpenFEC API" -path = "https://api.open.fec.gov/developers/" +path = "https://api.open.fec.gov/developers/" [[resources]] # Path: population.json path = "population.json" @@ -520,45 +516,45 @@ When using this dataset, please refer to IPUMS USA [terms of use](https://usa.ip Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. 
Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0""" [[resources.sources]] title = "IPUMS USA" -path = "https://usa.ipums.org/usa/" +path = "https://usa.ipums.org/usa/" [[resources]] # Path: population_engineers_hurricanes.csv -path = "population_engineers_hurricanes.csv" +path = "population_engineers_hurricanes.csv" description = """Per-state data on population, number of engineers, and hurricanes. Used in Vega-Lite example, [Three Choropleths Representing Disjoint DAta from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html)""" [[resources.sources]] title = "Bureau of Labor Statistics" -path = "https://www.bls.gov/oes/tables.htm" +path = "https://www.bls.gov/oes/tables.htm" [[resources.sources]] title = "American Community Survey" -path = "https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_07_3YR_S1901&prodType=table" +path = "https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_07_3YR_S1901&prodType=table" [[resources.sources]] title = "NOAA National Climatic Data Center" -path = "https://www.ncdc.noaa.gov/cdo-web/datatools/records" +path = "https://www.ncdc.noaa.gov/cdo-web/datatools/records" [[resources]] # Path: seattle-weather-hourly-normals.csv -path = "seattle-weather-hourly-normals.csv" +path = "seattle-weather-hourly-normals.csv" description = """Hourly weather normals with metric units. The 1981-2010 Climate Normals are NCDC's three-decade averages of climatological variables, including temperature and precipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf). 
We only included temperature, wind, and pressure and updated the format to be easier to parse.""" [[resources.sources]] title = "NOAA National Climatic Data Center (NCDC)" -path = "https://www.ncdc.noaa.gov/cdo-web/datatools/normals" +path = "https://www.ncdc.noaa.gov/cdo-web/datatools/normals" [[resources]] # Path: seattle-weather.csv -path = "seattle-weather.csv" +path = "seattle-weather.csv" description = """Daily weather records with metric units. Transformed using `/scripts/weather.py`. The categorical \"weather\" field is synthesized from multiple fields in the original dataset. This data is intended for instructional purposes.""" [[resources.sources]] title = "NOAA National Climatic Data Center" -path = "https://www.ncdc.noaa.gov/cdo-web/datatools/records" +path = "https://www.ncdc.noaa.gov/cdo-web/datatools/records" [[resources]] # Path: sp500-2000.csv -path = "sp500-2000.csv" +path = "sp500-2000.csv" description = """S&P 500 index values from 2000 to 2020.""" [[resources.sources]] title = "Yahoo Finance" -path = "https://finance.yahoo.com/quote/%5EDJI/history/" +path = "https://finance.yahoo.com/quote/%5EDJI/history/" [[resources]] # Path: sp500.csv path = "sp500.csv" @@ -586,13 +582,13 @@ The BLS Web site states: See full BLS [terms of service](https://www.bls.gov/developers/termsOfService.htm).""" [[resources.sources]] title = "U.S. 
Census Bureau Current Population Survey" -path = "https://www.census.gov/programs-surveys/cps.html" +path = "https://www.census.gov/programs-surveys/cps.html" [[resources.sources]] title = "BLS LAUS Data Tools" -path = "https://www.bls.gov/lau/data.htm" +path = "https://www.bls.gov/lau/data.htm" [[resources.sources]] title = "Bureau of Labor Statistics Table A-31" -path = "https://www.bls.gov/web/empsit/cpseea31.htm" +path = "https://www.bls.gov/web/empsit/cpseea31.htm" [[resources]] # Path: unemployment.tsv path = "unemployment.tsv" @@ -615,20 +611,20 @@ When using BLS public data API and datasets, users should adhere to the [BLS Ter [resources.schema] [[resources.schema.fields]] -name = "id" +name = "id" description = "The combined state and county FIPS code" [[resources.schema.fields]] -name = "rate" +name = "rate" description = "The unemployment rate for the county" [[resources.sources]] title = "BLS Developers API" -path = "https://www.bls.gov/developers/" +path = "https://www.bls.gov/developers/" [[resources.sources]] title = "BLS Handbook of Methods" -path = "https://www.bls.gov/opub/hom/lau/home.htm" +path = "https://www.bls.gov/opub/hom/lau/home.htm" [[resources]] # Path: uniform-2d.json path = "uniform-2d.json" @@ -645,50 +641,50 @@ Totals are included for the [22 \"supersectors\"](https://download.bls.gov/pub/t A calculated \"nonfarm_change\" column has been appended with the month-to-month change in that supersector's employment. It is useful for illustrating how to make bar charts that report both negative and positive values.""" [[resources.sources]] title = "U.S. Bureau of Labor Statistics Current Employment Statistics" -path = "https://www.bls.gov/ces/" +path = "https://www.bls.gov/ces/" [[resources]] # Path: us-state-capitals.json path = "us-state-capitals.json" [[resources]] # Path: volcano.json -path = "volcano.json" +path = "volcano.json" description = """Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. 
This data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a topographic map by Ross Ihaka, adapted from R datasets. These data should not be regarded as accurate.""" [[resources.sources]] title = "R Datasets" -path = "https://stat.ethz.ch/R-manual/R-patched/library/datasets/html/volcano.html" +path = "https://stat.ethz.ch/R-manual/R-patched/library/datasets/html/volcano.html" [[resources]] # Path: weather.csv -path = "weather.csv" +path = "weather.csv" description = """NOAA data transformed using `/scripts/weather.py`. Categorical \"weather\" field synthesized from multiple fields in the original dataset. This data is intended for instructional purposes.""" [[resources.sources]] title = "NOAA Climate Data Online" -path = "http://www.ncdc.noaa.gov/cdo-web/datatools/findstation" +path = "http://www.ncdc.noaa.gov/cdo-web/datatools/findstation" [[resources]] # Path: weather.json -path = "weather.json" +path = "weather.json" description = """Instructional dataset showing actual and predicted temperature data.""" [[resources]] # Path: wheat.json -path = "wheat.json" +path = "wheat.json" description = """In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair), a Scottish engineer who is often credited as the founder of statistical graphics, published an elegant chart on the price of wheat. It plots 250 years of prices alongside weekly wages and the reigning monarch. 
He intended to demonstrate that “never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time.”""" [[resources.sources]] title = "1822 Playfair Chart" -path = "http://dh101.humanities.ucla.edu/wp-content/uploads/2014/08/Vis_2.jpg" +path = "http://dh101.humanities.ucla.edu/wp-content/uploads/2014/08/Vis_2.jpg" [[resources]] # Path: windvectors.csv -path = "windvectors.csv" +path = "windvectors.csv" description = """Simulated wind patterns over northwestern Europe.""" [[resources]] # Path: world-110m.json path = "world-110m.json" [[resources]] # Path: zipcodes.csv -path = "zipcodes.csv" +path = "zipcodes.csv" description = """GeoNames.org""" [[resources.sources]] title = "GeoNames" -path = "https://www.geonames.org" +path = "https://www.geonames.org" From 4ff32cc5debf78346aea9c4a2f506790bbeb6444 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 10 Dec 2024 16:15:43 +0000 Subject: [PATCH 06/40] style: Use an inline table for `unemployment.tsv` schema The other schemas have much more content, so leaving those as-is --- SOURCES.toml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/SOURCES.toml b/SOURCES.toml index eca7bab0..d1098bbe 100644 --- a/SOURCES.toml +++ b/SOURCES.toml @@ -610,14 +610,10 @@ For the most up-to-date LAUS data: When using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm).""" [resources.schema] -[[resources.schema.fields]] -name = "id" -description = "The combined state and county FIPS code" - -[[resources.schema.fields]] -name = "rate" -description = "The unemployment rate for the county" - +fields = [ + { name = "id", description = "The combined state and county FIPS code" }, + { name = "rate", description = "The unemployment rate for the county" }, +] [[resources.sources]] title = "BLS Developers API" path = 
"https://www.bls.gov/developers/" From 56e01e92f5554baa6c745d72088aa42c29ceb944 Mon Sep 17 00:00:00 2001 From: Daniel Sorid <63077097+dsmedia@users.noreply.github.com> Date: Tue, 10 Dec 2024 17:24:49 -0500 Subject: [PATCH 07/40] improve dataset description readability Co-authored-by: Dan Redding <125183946+dangotbanned@users.noreply.github.com> --- SOURCES.toml | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/SOURCES.toml b/SOURCES.toml index d1098bbe..d1928bb1 100644 --- a/SOURCES.toml +++ b/SOURCES.toml @@ -49,12 +49,23 @@ path = "budgets.json" [[resources]] # Path: burtin.json path = "burtin.json" -description = """The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine. The dataset compares the performance of three antibiotics against 16 different bacteria. The numerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness. The dataset was featured as an example in the Protovis project, a precursor to D3.js. The Protovis example notes that, \"Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 μg/ml, and an exaggeration of some values for penicillin.\" The vega-datsets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together. +description = """The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine. +The dataset compares the performance of three antibiotics against 16 different bacteria. 
+The numerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness. +The dataset was featured as an example in the Protovis project, a precursor to D3.js. +The Protovis example notes that, \"Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 μg/ml, and an exaggeration of some values for penicillin\". +The vega-datsets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together. The caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) reads as follows: > ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin - > -> The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in red and gram- in blue) with their sensitivities to penicillin, and streptomycin. The effectiveness of the antibiotics is expressed as the highest dilution in μ/ml. which inhibits the test organism. High dilutions are toward the periphery; consequently the length of the colored bar is proportional to the effectiveness. It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. vulgaris, S. schottmuelleri and M. tuberculosis. Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to neomycin, although the majority of these are sensitive to neomycin. It also inhibits actinomycetes, but is inactive against viruses and fungi. 
Its mode of action is not understood.""" +> +> The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in red and gram- in blue) with their sensitivities to penicillin, and streptomycin. +> The effectiveness of the antibiotics is expressed as the highest dilution in μ/ml. which inhibits the test organism. +> High dilutions are toward the periphery; consequently the length of the colored bar is proportional to the effectiveness. +> It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. vulgaris, S. schottmuelleri and M. tuberculosis. +> Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to neomycin, although the majority of these are sensitive to neomycin. +> It also inhibits actinomycetes, but is inactive against viruses and fungi. Its mode of action is not understood. +""" [[resources.sources]] title = "Scope Magazine" From 5f44926e93ff01be0db857cb6e7ffd7929e4884f Mon Sep 17 00:00:00 2001 From: Daniel Sorid <63077097+dsmedia@users.noreply.github.com> Date: Tue, 10 Dec 2024 17:25:32 -0500 Subject: [PATCH 08/40] improve dataset description readability Co-authored-by: Dan Redding <125183946+dangotbanned@users.noreply.github.com> --- SOURCES.toml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/SOURCES.toml b/SOURCES.toml index d1928bb1..5dbbbd3e 100644 --- a/SOURCES.toml +++ b/SOURCES.toml @@ -18,8 +18,12 @@ path = "anscombe.json" description = """Graphs in Statistical Analysis, F. J. Anscombe, The American Statistician.""" [[resources]] # Path: barley.json -path = "barley.json" -description = """The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites. It was first published by agronomists F.R. Immer, H.K. Hayes, and L. 
Powers in the 1934 paper \"Statistical Determination of Barley Varietal Adaption.\" R.A. Fisher's popularized its use in the field of statistics when he included it in his book \"The Design of Experiments.\" Since then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s.""" +path = "barley.json" +description = """The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites. +It was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper \"Statistical Determination of Barley Varietal Adaption\". +R.A. Fisher's popularized its use in the field of statistics when he included it in his book \"The Design of Experiments\". +Since then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s. +""" [[resources.sources]] title = "The Design of Experiments Reference" From eadf7c52d218ded5d92e61302cdc84623db5c9ee Mon Sep 17 00:00:00 2001 From: ds <63077097+dsmedia@users.noreply.github.com> Date: Wed, 11 Dec 2024 06:39:03 -0500 Subject: [PATCH 09/40] Reduce line length; move columns to table schema --- SOURCES.toml | 453 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 353 insertions(+), 100 deletions(-) diff --git a/SOURCES.toml b/SOURCES.toml index 5dbbbd3e..ab89c191 100644 --- a/SOURCES.toml +++ b/SOURCES.toml @@ -19,10 +19,18 @@ description = """Graphs in Statistical Analysis, F. J. Anscombe, The American St [[resources]] # Path: barley.json path = "barley.json" -description = """The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites. -It was first published by agronomists F.R. Immer, H.K. Hayes, and L. 
Powers in the 1934 paper \"Statistical Determination of Barley Varietal Adaption\". -R.A. Fisher's popularized its use in the field of statistics when he included it in his book \"The Design of Experiments\". -Since then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s. +description = """ +The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 \ +different varieties of barley at six different sites. + +It was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper \ +"Statistical Determination of Barley Varietal Adaptation". + +R.A. Fisher popularized its use in the field of statistics when he included it in his book "The \ +Design of Experiments". + +Since then it has been used to demonstrate new statistical techniques, including the trellis charts \ +developed by Richard Becker, William Cleveland and others in the 1990s. """ [[resources.sources]] @@ -53,22 +61,49 @@ path = "budgets.json" [[resources]] # Path: burtin.json path = "burtin.json" -description = """The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine. +description = """ +The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic \ +effectiveness, originally published in Scope Magazine. + The dataset compares the performance of three antibiotics against 16 different bacteria. -The numerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness. + +Numerical values in the dataset represent the minimum inhibitory concentration (MIC) of each \ +antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic \ +effectiveness.
+ +The dataset was featured as an example in the Protovis project, a precursor to D3.js. -The Protovis example notes that, \"Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 μg/ml, and an exaggeration of some values for penicillin\". -The vega-datsets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together. -The caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) reads as follows: + +As noted in the Protovis example, "Recreating this display revealed some minor errors in the original: \ +a missing grid line at 0.01 μg/ml, and an exaggeration of some values for penicillin". + +The vega-datasets version is largely consistent with the Protovis version of the dataset, with one \ +correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a \ +new column, 'Genus', to group related bacterial species together. + +The caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/\ +wmgda_8616c.jpg) reads as follows: + > ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin > > -> The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in red and gram- in blue) with their sensitivities to penicillin, and streptomycin. -> The effectiveness of the antibiotics is expressed as the highest dilution in μ/ml. which inhibits the test organism. -> High dilutions are toward the periphery; consequently the length of the colored bar is proportional to the effectiveness. -> It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. vulgaris, S.
schottmuelleri and M. tuberculosis. -> Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to neomycin, although the majority of these are sensitive to neomycin. -> It also inhibits actinomycetes, but is inactive against viruses and fungi. Its mode of action is not understood. +> The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in +> red and gram- in blue) with their sensitivities to penicillin, and streptomycin. +> +> The effectiveness of the antibiotics is expressed as the highest dilution in μ/ml. which inhibits +> the test organism. +> +> High dilutions are toward the periphery; consequently the length of the colored bar is proportional +> to the effectiveness. +> +> It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. +> fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. +> vulgaris, S. schottmuelleri and M. tuberculosis. +> +> Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to +> neomycin, although the majority of these are sensitive to neomycin. +> +> It also inhibits actinomycetes, but is inactive against viruses and fungi. Its mode of action is +> not understood. """ [[resources.sources]] @@ -88,7 +123,8 @@ path = "http://lib.stat.cmu.edu/datasets/" [[resources]] # Path: co2-concentration.csv path = "co2-concentration.csv" -description = """Scripps CO2 program data ut modified to only include date, CO2, seasonally adjusted CO2. Only includes rows with valid data.""" +description = """Scripps CO2 program data modified to only include date, CO2, seasonally adjusted CO2.
+Only includes rows with valid data.""" [[resources.sources]] title = "Scripps CO2 Program" @@ -96,7 +132,13 @@ path = "https://scrippsco2.ucsd.edu/data/atmospheric_co2/primary_mlo_co2_record [[resources]] # Path: countries.json path = "countries.json" -description = """This dataset combines key demographic indicators (life expectancy at birth and fertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year intervals. It includes both current values and adjacent time period values (previous and next) for each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) notes that its philosophy is to fill data gaps with estimates and use current geographic boundaries for historical data. Gapminder states that it aims to \"show people the big picture\" rather than support detailed numeric analysis.""" +description = """This dataset combines key demographic indicators (life expectancy at birth and +fertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year +intervals. It includes both current values and adjacent time period values (previous and next) +for each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/\ +documentation/) notes that its philosophy is to fill data gaps with estimates and use current +geographic boundaries for historical data. Gapminder states that it aims to "show people the +big picture" rather than support detailed numeric analysis.""" [resources.schema] [[resources.schema.fields]] @@ -247,7 +289,9 @@ path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146 [[resources]] # Path: football.json path = "football.json" -description = """Football match outcomes across multiple divisions from 2013 to 2017, part of a larger dataset from OpenFootball.
The subset was made such that there are records for all five chosen divisions over the time period.""" +description = """Football match outcomes across multiple divisions from 2013 to 2017, part of a +larger dataset from OpenFootball. The subset was made such that there are records for all five +chosen divisions over the time period.""" [[resources.sources]] title = "OpenFootball" @@ -255,10 +299,15 @@ path = "https://github.com/openfootball/football.json" [[resources]] # Path: gapminder-health-income.csv path = "gapminder-health-income.csv" -description = """Per-capita income, life expectancy, population and regional grouping. Dataset does not specify the reference year for the data. Gapminder historical data is subject to revisions. +description = """ +Per-capita income, life expectancy, population and regional grouping. Dataset does not specify +the reference year for the data. Gapminder historical data is subject to revisions. Gapminder (v30, 2023) defines per-capita income as follows: ->\"This is real GDP per capita (gross domestic product per person adjusted for inflation) converted to international dollars using purchasing power parity rates. An international dollar has the same purchasing power over GDP as the U.S. dollar has in the United States.\"""" +>"This is real GDP per capita (gross domestic product per person adjusted for inflation) +>converted to international dollars using purchasing power parity rates. An international dollar +>has the same purchasing power over GDP as the U.S. dollar has in the United States." +""" [[resources.sources]] title = "Gapminder Foundation" path = "https://www.gapminder.org" @@ -271,12 +320,29 @@ path = "https://www.gapminder.org/free-material/" [[resources]] # Path: gapminder.json path = "gapminder.json" -description = """This dataset combines key demographic indicators (life expectancy at birth, population, and fertility rate measured as babies per woman) for various countries from 1955 to 2005 at 5-year intervals. 
It also includes a 'cluster' column, a categorical variable grouping countries. Gapminder's data documentation notes that its philosophy is to fill data gaps with estimates and use current geographic boundaries for historical data. Gapminder states that it aims to "show people the big picture" rather than support detailed numeric analysis. +description = """This dataset combines key demographic indicators (life expectancy at birth, +population, and fertility rate measured as babies per woman) for various countries from 1955 +to 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable +grouping countries. Gapminder's data documentation notes that its philosophy is to fill data +gaps with estimates and use current geographic boundaries for historical data. Gapminder +states that it aims to "show people the big picture" rather than support detailed numeric +analysis. Notes: -1. Country Selection: The set of countries in this file matches the version of this dataset originally added to this collection in 2015. The specific criteria for country selection in that version are not known. Data for Aruba are no longer available in the new version. Hong Kong has been revised to Hong Kong, China in the new version. -2. Data Precision: The precision of float values may have changed from the original version. These changes reflect the most recent source data used for each indicator. -3. Regional Groupings: The 'cluster' column represents a regional mapping of countries corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. To preserve continuity with previous versions of this dataset, we have retained the column name 'cluster' instead of renaming it to 'six_regions'. The six regions represented are: `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`.""" +1. 
Country Selection: The set of countries in this file matches the version of this dataset + originally added to this collection in 2015. The specific criteria for country selection + in that version are not known. Data for Aruba are no longer available in the new version. + Hong Kong has been revised to Hong Kong, China in the new version. + +2. Data Precision: The precision of float values may have changed from the original version. + These changes reflect the most recent source data used for each indicator. + +3. Regional Groupings: The 'cluster' column represents a regional mapping of countries + corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. To + preserve continuity with previous versions of this dataset, we have retained the column + name 'cluster' instead of renaming it to 'six_regions'. The six regions represented are: + `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, + 4: east_asia_pacific, 5: middle_east_north_africa`.""" [resources.schema] [[resources.schema.fields]] @@ -364,7 +430,10 @@ path = "income.json" [[resources]] # Path: iowa-electricity.csv path = "iowa-electricity.csv" -description = """The state of Iowa has dramatically increased its production of renewable wind power in recent years. This file contains the annual net generation of electricity in the state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. It is useful for illustrating stacked area charts.""" +description = """The state of Iowa has dramatically increased its production of renewable +wind power in recent years. This file contains the annual net generation of electricity in +the state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. +It is useful for illustrating stacked area charts.""" [[resources.sources]] title = "U.S. 
Energy Information Administration" @@ -372,24 +441,66 @@ path = "https://www.eia.gov/beta/electricity/data/browser/#/topic/0?agg=2,0,1&f [[resources]] # Path: jobs.json path = "jobs.json" -description = """U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census microdata\" from as early as 1790. -Originally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Viégas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). The dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/). -Data is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of IPUMS USA, according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions). -The dataset is structured as follows: - -- job: The occupation title -- sex: Sex (men/women) -- year: Census year -- count: Number of individuals in the occupation -- perc: Percentage of the workforce in the occupation -IPUMS USA confirmed in 2024 correspondence that hosting this dataset on vega-datasets is permissible, stating: ->We're excited to hear that this dataset made its way to this repository and is being used by students for data visualization. We allow for these types of redistributions of summary data so long as the underlying microdata records are not shared. +description = """ +U.S. 
census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by \ +sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which \ +"collects, preserves and harmonizes U.S. census microdata" from as early as 1790. + +Originally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, \ +Martin Wattenberg and Fernanda Viégas), described [here](https://homes.cs.washington.edu/~jheer/files/\ +bdata_ch12.pdf). The dataset is also referenced in this vega [example](https://vega.github.io/vega/\ +examples/job-voyager/). + +Data is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) \ +variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of \ +IPUMS USA, according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to \ +occupation coding since version 6, particularly for 19th-century samples, which may result in \ +discrepancies between this dataset and current IPUMS data. Details on data revisions are available \ +[here](https://usa.ipums.org/usa-action/revisions). + +IPUMS USA confirmed in 2024 correspondence that hosting this dataset on vega-datasets is permissible, \ +stating: +>We're excited to hear that this dataset made its way to this repository and is being used by students \ +for data visualization. We allow for these types of redistributions of summary data so long as the \ +underlying microdata records are not shared. + This dataset contains only summary statistics and does not include any underlying microdata records. + 1. This dataset represents summary data. The underlying microdata records are not included. -2. Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section) (person weight) variable as an expansion factor when working with IPUMS USA extracts. -3. 
Due to coding revisions, figures for earlier years (particularly 19th century) may not match current IPUMS USA data exactly. -When using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml). The organization requests use of the following citation for this json file: -Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0""" +2. Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/\ +usa-action/variables/PERWT#description_section) (person weight) variable as an expansion factor when \ +working with IPUMS USA extracts. +3. Due to coding revisions, figures for earlier years (particularly 19th century) may not match current \ +IPUMS USA data exactly. + +When using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/\ +terms.shtml). The organization requests use of the following citation for this json file: + +Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use \ +Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. 
http://doi.org/10.18128/\ +D010.V6.0 +""" + +[resources.schema] +[[resources.schema.fields]] +name = "job" +description = "The occupation title" + +[[resources.schema.fields]] +name = "sex" +description = "Sex (men/women)" + +[[resources.schema.fields]] +name = "year" +description = "Census year" + +[[resources.schema.fields]] +name = "count" +description = "Number of individuals in the occupation" + +[[resources.schema.fields]] +name = "perc" +description = "Percentage of the workforce in the occupation" [[resources.sources]] title = "IPUMS USA" @@ -398,7 +509,9 @@ version = "6.0" [[resources]] # Path: la-riots.csv path = "la-riots.csv" -description = """More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles for five days starting on April 29, 1992. This file contains metadata about each person, including the geographic coordinates of their death. Compiled and published by the Los Angeles Times Data Desk.""" +description = """More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles +for five days starting on April 29, 1992. This file contains metadata about each person, including the geographic +coordinates of their death. Compiled and published by the Los Angeles Times Data Desk.""" [[resources.sources]] title = "LA Riots Deaths, Los Angeles Times Data Desk" @@ -406,7 +519,9 @@ path = "http://spreadsheets.latimes.com/la-riots-deaths/" [[resources]] # Path: londonBoroughs.json path = "londonBoroughs.json" -description = """Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. Original data \"contains National Statistics data © Crown copyright and database right (2015)\" and \"Contains Ordnance Survey data © Crown copyright and database right [2015].""" +description = """Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. 
+Original data \"contains National Statistics data © Crown copyright and database right (2015)\" +and \"Contains Ordnance Survey data © Crown copyright and database right [2015].""" [[resources.sources]] title = "Statistical GIS Boundary Files, London Datastore" @@ -438,19 +553,43 @@ path = "monarchs.json" description = """A chronological list of English and British monarchs from Elizabeth I through George IV. Each entry includes: -- `name`: The ruler's name or identifier (e.g., \"W&M\" for William and Mary, \"Cromwell\" for the period of interregnum) -- `start`: The year their rule began. -- `end`: The year their rule ended -- `index`: A [zero-based sequential number](https://en.wikipedia.org/wiki/Zero-based_numbering) assigned to each entry, representing the chronological order of rulers -- `commonwealth`: A Boolean flag (true) for the period from 1649 to 1660. This field is omitted for all other entries. -The dataset contains two intentional inaccuracies to maintain compatibility with the [Wheat and Wages](https://vega.github.io/vega/examples/wheat-and-wages/) example visualization: +The dataset contains two intentional inaccuracies to maintain compatibility with +the [Wheat and Wages](https://vega.github.io/vega/examples/wheat-and-wages/) example visualization: 1. the start date for the reign of Elizabeth I is shown as 1565, instead of 1558; 2. the end date for the reign of George IV is shown as 1820, instead of 1830. These discrepancies align the `monarchs.json` dataset with the start and end dates of the `wheat.json` dataset used i the visualization. -The entry \"W&M\" represents the joint reign of William III and Mary II. While the dataset shows their reign as 1689-1702, the official Web site of the British royal family indicates that Mary II's reign ended in 1694, though William III continued to rule until 1702. 
-The `commonwealth` field is used to flag the period from 1649 to 1660, which includes the Commonwealth of England, the Protectorate, and the period leading to the Restoration. While historically more accurate to call this the \"interregnum,\" the field name of `commonwealth` from the original dataset is retained for backwards compatibility. +The entry \"W&M\" represents the joint reign of William III and Mary II. While the dataset shows their reign as 1689-1702, +the official Web site of the British royal family indicates that Mary II's reign ended in 1694, though William III continued to rule until 1702. +The `commonwealth` field is used to flag the period from 1649 to 1660, which includes the Commonwealth of England, the Protectorate, +and the period leading to the Restoration. While historically more accurate to call this the \"interregnum,\" the field name of `commonwealth` +from the original dataset is retained for backwards compatibility. The dataset was revised in Aug. 2024. James II's reign now ends in 1688 (previously 1689). -Source data has been verified against the kings & queens and interregnum pages of the official website of the British royal family (retrieved in Aug. 2024). Content on the site is protected by Crown Copyright. Under the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most Crown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/).""" +Source data has been verified against the kings & queens and interregnum pages of the official website of the British royal family (retrieved in Aug. 2024). +Content on the site is protected by Crown Copyright. 
+Under the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most +Crown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/).""" + +[resources.schema] + +[[resources.schema.fields]] +name = "name" +description = "The ruler's name or identifier (e.g., \"W&M\" for William and Mary, \"Cromwell\" for the period of interregnum)" + +[[resources.schema.fields]] +name = "start" +description = "The year their rule began" + +[[resources.schema.fields]] +name = "end" +description = "The year their rule ended" + +[[resources.schema.fields]] +name = "index" +description = "A zero-based sequential number assigned to each entry, representing the chronological order of rulers" + +[[resources.schema.fields]] +name = "commonwealth" +description = "A Boolean flag (true) for the period from 1649 to 1660. This field is omitted for all other entries" [[resources.sources]] title = "The Royal Family - Kings & Queens" @@ -462,7 +601,8 @@ path = "https://www.royal.uk/interregnum-1649-1660" [[resources]] # Path: movies.json path = "movies.json" -description = """The dataset has well known and intentionally included errors. This dataset is used for instructional purposes, including the need to reckon with dirty data.""" +description = """The dataset has well known and intentionally included errors. 
+This dataset is provided for instructional purposes, including the need to reckon with dirty data.""" [[resources]] # Path: normal-2d.json path = "normal-2d.json" @@ -472,7 +612,9 @@ path = "obesity.json" [[resources]] # Path: ohlc.json path = "ohlc.json" -description = """This dataset contains the performance of the Chicago Board Options Exchange [Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/%5EVIX?ltr=1#eyJpbnRlcnZhbCI6ImRheSIsInBlcmlvZGljaXR5IjoxLCJ0aW1lVW5pdCI6bnVsbCwiY2FuZGxlV2lkdGgiOjgsInZvbHVtZVVuZGVybGF5Ijp0cnVlLCJhZGoiOnRydWUsImNyb3NzaGFpciI6dHJ1ZSwiY2hhcnRUeXBlIjoibGluZSIsImV4dGVuZGVkIjpmYWxzZSwibWFya2V0U2Vzc2lvbnMiOnt9LCJhZ2dyZWdhdGlvblR5cGUiOiJvaGxjIiwiY2hhcnRTY2FsZSI6ImxpbmVhciIsInN0dWRpZXMiOnsidm9sIHVuZHIiOnsidHlwZSI6InZvbCB1bmRyIiwiaW5wdXRzIjp7ImlkIjoidm9sIHVuZHIiLCJkaXNwbGF5Ijoidm9sIHVuZHIifSwib3V0cHV0cyI6eyJVcCBWb2x1bWUiOiIjMDBiMDYxIiwiRG93biBWb2x1bWUiOiIjRkYzMzNBIn0sInBhbmVsIjoiY2hhcnQiLCJwYXJhbWV0ZXJzIjp7IndpZHRoRmFjdG9yIjowLjQ1LCJjaGFydE5hbWUiOiJjaGFydCJ9fX0sInBhbmVscyI6eyJjaGFydCI6eyJwZXJjZW50IjoxLCJkaXNwbGF5IjoiXlZJWCIsImNoYXJ0TmFtZSI6ImNoYXJ0IiwidG9wIjowfX0sInNldFNwYW4iOnt9LCJsaW5lV2lkdGgiOjIsInN0cmlwZWRCYWNrZ3JvdWQiOnRydWUsImV2ZW50cyI6dHJ1ZSwiY29sb3IiOiIjMDA4MWYyIiwiZXZlbnRNYXAiOnsiY29ycG9yYXRlIjp7ImRpdnMiOnRydWUsInNwbGl0cyI6dHJ1ZX0sInNpZ0RldiI6e319LCJzeW1ib2xzIjpbeyJzeW1ib2wiOiJeVklYIiwic3ltYm9sT2JqZWN0Ijp7InN5bWJvbCI6Il5WSVgifSwicGVyaW9kaWNpdHkiOjEsImludGVydmFsIjoiZGF5IiwidGltZVVuaXQiOm51bGwsInNldFNwYW4iOnt9fV19)) in the summer of 2009.""" +description = """This dataset contains the performance of the Chicago Board Options Exchange +[Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/ +%5EVIX#overview)) in the summer of 2009.""" [[resources.sources]] title = "Yahoo Finance VIX Data" @@ -480,7 +622,10 @@ path = "https://finance.yahoo.com/chart/%5EVIX" [[resources]] # Path: penguins.json path = "penguins.json" -description = """Palmer Archipelago 
(Antarctica) penguin data collected and made available by [Dr. Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) and the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research Network](https://lternet.edu/).""" +description = """Palmer Archipelago (Antarctica) penguin data collected and made available by +[Dr. Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) +and the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research +Network](https://lternet.edu/).""" [[resources.sources]] title = "Palmer Station Antarctica LTER" @@ -502,14 +647,27 @@ path = "points.json" [[resources]] # Path: political-contributions.json path = "political-contributions.json" -description = """Summary financial information on contributions to candidates for U.S. elections. An updated version of this datset is available from the \"all candidates\" files (in pipe-delimited format) on the bulk data download page of the U.S. Federal Election Commission, or, alternatively, via OpenFEC. Information on each of the 25 columns is available from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/). The sample dataset in `political-contributions.json` contains 58 records with dates from 2015. -FEC data is subject to the commission's: +description = """Summary financial information on contributions to candidates for U.S. +elections. An updated version of this datset is available from the "all candidates" files +(in pipe-delimited format) on the bulk data download page of the U.S. Federal Election +Commission, or, alternatively, via OpenFEC. Information on each of the 25 columns is +available from the [FEC All Candidates File Description](https://www.fec.gov/ +campaign-finance-data/all-candidates-file-description/). The sample dataset in +`political-contributions.json` contains 58 records with dates from 2015. 
+FEC data is subject to the commission's: - [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/) - [Privacy and Security Policy](https://www.fec.gov/about/privacy-and-security-policy/) - [Acceptable Use Policy](https://github.com/fecgov/FEC/blob/master/ACCEPTABLE-USE-POLICY.md) + Additionally, the FEC's Github [repository](https://github.com/fecgov/FEC) states: -> This project is in the public domain within the United States, and we waive worldwide copyright and related rights through [CC0 universal public domain](https://creativecommons.org/publicdomain/zero/1.0/) dedication. Read more on our license page. A few restrictions limit the way you can use FEC data. For example, you can't use contributor lists for commercial purposes or to solicit donations. Learn more on [FEC.gov](https://www.fec.gov/).""" +> This project is in the public domain within the United States, and we waive worldwide +> copyright and related rights through [CC0 universal public domain](https://creativecommons.org/publicdomain/zero/1.0/) +> dedication. Read more on our license page. +> A few restrictions limit the way you can use FEC data. For example, you can't use +> contributor lists for commercial purposes or to solicit donations. Learn more on +> [FEC.gov](https://www.fec.gov/).""" + [[resources.sources]] title = "Federal Election Commission Bulk Data" path = "https://www.fec.gov/data/browse-data/?tab=bulk-data" @@ -519,23 +677,46 @@ path = "https://api.open.fec.gov/developers/" [[resources]] # Path: population.json path = "population.json" -description = """United States population statistics by sex and age group across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census microdata\" from as early as 1790. -The dataset is structured as follows: +description = """ +United States population statistics by sex and age group across decades between 1850 and 2000. 
+The dataset was obtained from IPUMS USA, which "collects, preserves and harmonizes U.S. census +microdata" from as early as 1790. + +IPUMS updates and revises datasets over time, which may result in discrepancies between this +dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions). + +When using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml). The organization requests the +use of the following citation for this json file: +Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated +Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. +http://doi.org/10.18128/D010.V6.0 +""" + +[resources.schema] +[[resources.schema.fields]] +name = "year" +description = "Four-digit year of the survey" + +[[resources.schema.fields]] +name = "age" +description = "Age group in 5-year intervals (0=0-4, 5=5-9, 10=10-14, ..., 90=90+)" + +[[resources.schema.fields]] +name = "sex" +description = "Sex (1=men, 2=women)" + +[[resources.schema.fields]] +name = "people" +description = "Number of individuals (IPUMS PERWT)" -- year: four-digit year of the survey. - [IPUMS description](https://usa.ipums.org/usa-action/variables/YEAR#description_section) -- age: age group in 5-year intervals (0 represents ages 0-4, 5 represents 5-9, 10 represents 10-14, etc., up to 90 representing 90 and above) - [IPUMS description](https://usa.ipums.org/usa-action/variables/AGE#description_section) -- sex: Sex (men = 1 / women = 2) - [IPUMS description](https://usa.ipums.org/usa-action/variables/SEX#description_section) -- people: Number of individuals, equivalent to IPUMS variable name [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section). -IPUMS updates and revises datasets over time, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions). 
-When using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml). The organization requests the use of the following citation for this json file: -Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0""" [[resources.sources]] title = "IPUMS USA" path = "https://usa.ipums.org/usa/" [[resources]] # Path: population_engineers_hurricanes.csv path = "population_engineers_hurricanes.csv" -description = """Per-state data on population, number of engineers, and hurricanes. Used in Vega-Lite example, [Three Choropleths Representing Disjoint DAta from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html)""" +description = """Per-state data on population, number of engineers, and hurricanes. Used in Vega-Lite example, +[Three Choropleths Representing Disjoint DAta from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html)""" [[resources.sources]] title = "Bureau of Labor Statistics" @@ -549,7 +730,11 @@ path = "https://www.ncdc.noaa.gov/cdo-web/datatools/records" [[resources]] # Path: seattle-weather-hourly-normals.csv path = "seattle-weather-hourly-normals.csv" -description = """Hourly weather normals with metric units. The 1981-2010 Climate Normals are NCDC's three-decade averages of climatological variables, including temperature and precipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf). We only included temperature, wind, and pressure and updated the format to be easier to parse.""" +description = """Hourly weather normals with metric units. The 1981-2010 Climate Normals are +NCDC's three-decade averages of climatological variables, including temperature and +precipitation. 
Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/\ +documentation/NORMAL_HLY_documentation.pdf). We only included temperature, wind, and pressure +and updated the format to be easier to parse.""" [[resources.sources]] title = "NOAA National Climatic Data Center (NCDC)" @@ -557,7 +742,9 @@ path = "https://www.ncdc.noaa.gov/cdo-web/datatools/normals" [[resources]] # Path: seattle-weather.csv path = "seattle-weather.csv" -description = """Daily weather records with metric units. Transformed using `/scripts/weather.py`. The categorical \"weather\" field is synthesized from multiple fields in the original dataset. This data is intended for instructional purposes.""" +description = """Daily weather records with metric units. Transformed using `/scripts/weather.py`. +The categorical \"weather\" field is synthesized from multiple fields in the original dataset. +This data is intended for instructional purposes.""" [[resources.sources]] title = "NOAA National Climatic Data Center" @@ -582,19 +769,49 @@ path = "udistrict.json" [[resources]] # Path: unemployment-across-industries.json path = "unemployment-across-industries.json" -description = """Industry-level unemployment statistics from the Current Population Survey (CPS), published monthly by the U.S. Bureau of Labor Statistics. Includes unemployed persons and unemployment rate across 11 private industries, as well as agricultural, government, and self-employed workers. Covers January 2000 through February 2010. Industry classification follows format of CPS Table A-31. -Each entry in the JSON file contains: - -- `series`: Industry name -- `year`: Year (2000-2010) -- `month`: Month (1-12) -- `count`: Number of unemployed persons (in thousands) -- `rate`: Unemployment rate (percentage) -- `date`: [ISO 8601](https://www.iso.org/iso-8601-date-and-time-format.html)-formatted date string (e.g., \"2000-01-01T08:00:00.000Z\") -The dataset can be replicated using the BLS API. 
For more, see the `scripts` folder of this repository. +description = """Industry-level unemployment statistics from the Current Population Survey +(CPS), published monthly by the U.S. Bureau of Labor Statistics. Includes unemployed persons +and unemployment rate across 11 private industries, as well as agricultural, government, and +self-employed workers. Covers January 2000 through February 2010. Industry classification +follows format of CPS Table A-31. + +The dataset can be replicated using the BLS API. For more, see the `scripts` folder of this +repository. + The BLS Web site states: -> \"Users of the public API should cite the date that data were accessed or retrieved using the API. Users must clearly state that “BLS.gov cannot vouch for the data or analyses derived from these data after the data have been retrieved from BLS.gov.” The BLS.gov logo may not be used by persons who are not BLS employees or on products (including web pages) that are not BLS-sponsored.\" +> "Users of the public API should cite the date that data were accessed or retrieved using +> the API. Users must clearly state that "BLS.gov cannot vouch for the data or analyses +> derived from these data after the data have been retrieved from BLS.gov." The BLS.gov logo +> may not be used by persons who are not BLS employees or on products (including web pages) +> that are not BLS-sponsored." 
+ See full BLS [terms of service](https://www.bls.gov/developers/termsOfService.htm).""" + +[resources.schema] +[[resources.schema.fields]] +name = "series" +description = "Industry name" + +[[resources.schema.fields]] +name = "year" +description = "Year (2000-2010)" + +[[resources.schema.fields]] +name = "month" +description = "Month (1-12)" + +[[resources.schema.fields]] +name = "count" +description = "Number of unemployed persons (in thousands)" + +[[resources.schema.fields]] +name = "rate" +description = "Unemployment rate (percentage)" + +[[resources.schema.fields]] +name = "date" +description = "ISO 8601-formatted date string (e.g., \"2000-01-01T08:00:00.000Z\")" + [[resources.sources]] title = "U.S. Census Bureau Current Population Survey" path = "https://www.census.gov/programs-surveys/cps.html" @@ -607,16 +824,26 @@ path = "https://www.bls.gov/web/empsit/cpseea31.htm" [[resources]] # Path: unemployment.tsv path = "unemployment.tsv" -description = """This dataset contains county-level unemployment rates in the United States, with data generally consistent with levels reported in 2009. The dataset is structured as tab-separated values. - -The unemployment rate represents the number of unemployed persons as a percentage of the labor force. According to the [Bureau of Labor Statistics (BLS) glossary](https://www.bls.gov/opub/hom/glossary.htm#U): -> Unemployed persons (Current Population Survey) [are] persons aged 16 years and older who had no employment during the reference week, were available for work, except for temporary illness, and had made specific efforts to find employment sometime during the 4-week period ending with the reference week. Persons who were waiting to be recalled to a job from which they had been laid off need not have been looking for work to be classified as unemployed. 
- -This dataset is derived from the [Local Area Unemployment Statistics (LAUS)](https://www.bls.gov/lau/) program, a federal-state cooperative effort overseen by the Bureau of Labor Statistics (BLS). The LAUS program produces monthly and annual employment, unemployment, and labor force data for census regions and divisions, states, counties, metropolitan areas, and many cities and towns. +description = """This dataset contains county-level unemployment rates in the United States, with data generally +consistent with levels reported in 2009. The dataset is structured as tab-separated values. +The unemployment rate represents the number of unemployed persons as a percentage of the labor +force. According to the Bureau of Labor Statistics (BLS) glossary: + +Unemployed persons (Current Population Survey) [are] persons aged 16 years and older who had +no employment during the reference week, were available for work, except for temporary +illness, and had made specific efforts to find employment sometime during the 4-week period +ending with the reference week. Persons who were waiting to be recalled to a job from which +they had been laid off need not have been looking for work to be classified as unemployed. + +This dataset is derived from the [Local Area Unemployment Statistics (LAUS)](https://www.bls.gov/lau/) program, +a federal-state cooperative effort overseen by the Bureau of Labor Statistics (BLS). +The LAUS program produces monthly and annual employment, unemployment, and labor force data for census regions and divisions, +states, counties, metropolitan areas, and many cities and towns. For the most up-to-date LAUS data: 1. **Monthly and Annual Data Downloads**: -- Visit the [LAUS Data Tools](https://www.bls.gov/lau/data.htm) page for [monthly](https://www.bls.gov/lau/tables.htm#mcounty) and [annual](https://www.bls.gov/lau/tables.htm#cntyaa) county data. 
+- Visit the [LAUS Data Tools](https://www.bls.gov/lau/data.htm) page for [monthly](https://www.bls.gov/lau/tables.htm#mcounty) +and [annual](https://www.bls.gov/lau/tables.htm#cntyaa) county data. 2. **BLS Public Data API**: - The BLS provides an API for developers to access various datasets, including LAUS data. - To use the API for LAUS data, refer to the [LAUS Series ID Formats](https://www.bls.gov/help/hlpforma.htm#LA) to construct your query. @@ -625,10 +852,15 @@ For the most up-to-date LAUS data: When using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm).""" [resources.schema] -fields = [ - { name = "id", description = "The combined state and county FIPS code" }, - { name = "rate", description = "The unemployment rate for the county" }, -] + +[[resources.schema.fields]] +name = "id" +description = "The combined state and county FIPS code" + +[[resources.schema.fields]] +name = "rate" +description = "The unemployment rate for the county" + [[resources.sources]] title = "BLS Developers API" path = "https://www.bls.gov/developers/" @@ -645,11 +877,24 @@ path = "us-10m.json" [[resources]] # Path: us-employment.csv path = "us-employment.csv" -description = """In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job losses across the United States. The downturn in employment, and the slow recovery in hiring that followed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau of Labor Statistics. -This file contains the monthly employment total in a variety of job categories from January 2006 through December 2015. The numbers are seasonally adjusted and reported in thousands. The data were downloaded on Nov. 11, 2018, and reformatted for use in this library. - -Totals are included for the [22 \"supersectors\"](https://download.bls.gov/pub/time.series/ce/ce.supersector) tracked by the BLS. 
The \"nonfarm\" total is the category typically used by economists and journalists as a stand-in for the country's employment total. -A calculated \"nonfarm_change\" column has been appended with the month-to-month change in that supersector's employment. It is useful for illustrating how to make bar charts that report both negative and positive values.""" +description = """ +In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job +losses across the United States. The downturn in employment, and the slow recovery in hiring that +followed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau +of Labor Statistics. + +This file contains the monthly employment total in a variety of job categories from January 2006 +through December 2015. The numbers are seasonally adjusted and reported in thousands. The data +were downloaded on Nov. 11, 2018, and reformatted for use in this library. + +Totals are included for the [22 \"supersectors\"](https://download.bls.gov/pub/time.series/ce/\ +ce.supersector) tracked by the BLS. The \"nonfarm\" total is the category typically used by +economists and journalists as a stand-in for the country's employment total. + +A calculated \"nonfarm_change\" column has been appended with the month-to-month change in that +supersector's employment. It is useful for illustrating how to make bar charts that report both +negative and positive values. +""" [[resources.sources]] title = "U.S. Bureau of Labor Statistics Current Employment Statistics" path = "https://www.bls.gov/ces/" @@ -659,7 +904,9 @@ path = "us-state-capitals.json" [[resources]] # Path: volcano.json path = "volcano.json" -description = """Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. This data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a topographic map by Ross Ihaka, adapted from R datasets. 
These data should not be regarded as accurate.""" +description = """Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. +This data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a +topographic map by Ross Ihaka, adapted from R datasets. These data should not be regarded as accurate.""" [[resources.sources]] title = "R Datasets" @@ -667,7 +914,8 @@ path = "https://stat.ethz.ch/R-manual/R-patched/library/datasets/html/volcano.h [[resources]] # Path: weather.csv path = "weather.csv" -description = """NOAA data transformed using `/scripts/weather.py`. Categorical \"weather\" field synthesized from multiple fields in the original dataset. This data is intended for instructional purposes.""" +description = """NOAA data transformed using `/scripts/weather.py`. Categorical \"weather\" field synthesized +from multiple fields in the original dataset. This data is intended for instructional purposes.""" [[resources.sources]] title = "NOAA Climate Data Online" @@ -679,7 +927,12 @@ description = """Instructional dataset showing actual and predicted temperature [[resources]] # Path: wheat.json path = "wheat.json" -description = """In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair), a Scottish engineer who is often credited as the founder of statistical graphics, published an elegant chart on the price of wheat. It plots 250 years of prices alongside weekly wages and the reigning monarch. He intended to demonstrate that “never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time.”""" +description = """In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/\ +wiki/William_Playfair), a Scottish engineer who is often credited as the founder of +statistical graphics, published an elegant chart on the price of wheat. It plots 250 years +of prices alongside weekly wages and the reigning monarch. 
He intended to demonstrate that +"never at any former period was wheat so cheap, in proportion to mechanical labour, as it +is at the present time."""" [[resources.sources]] title = "1822 Playfair Chart" From 1ff024c5225e409d17f8e8435a0b1af6b71182d2 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:46:02 +0000 Subject: [PATCH 10/40] feat: Integrate `SOURCE.toml`with `datapackage.json` Rebuilds using content as of (https://github.com/vega/vega-datasets/pull/643/commits/eadf7c52d218ded5d92e61302cdc84623db5c9ee) --- datapackage.json | 523 ++++++++++++++++++++++++++++++++--- scripts/build_datapackage.py | 103 ++++++- 2 files changed, 583 insertions(+), 43 deletions(-) diff --git a/datapackage.json b/datapackage.json index 09d849d6..6379224c 100644 --- a/datapackage.json +++ b/datapackage.json @@ -21,11 +21,12 @@ } ], "version": "2.11.0", - "created": "2024-12-06T16:14:38.044099+00:00", + "created": "2024-12-11T16:26:59.665471+00:00", "resources": [ { "name": "7zip.png", "type": "file", + "description": "Application icons from open-source software projects.", "path": "7zip.png", "scheme": "file", "format": "png", @@ -76,6 +77,13 @@ { "name": "annual-precip.json", "type": "json", + "description": "A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell.", + "sources": [ + { + "title": "Climate Forecast System Version 2", + "path": "https://www.ncdc.noaa.gov/data-access/model-data/model-datasets/climate-forecast-system-version2-cfsv2" + } + ], "path": "annual-precip.json", "scheme": "file", "format": "json", @@ -85,6 +93,7 @@ { "name": "anscombe.json", "type": "table", + "description": "Graphs in Statistical Analysis, F. J. 
Anscombe, The American Statistician.", "path": "anscombe.json", "scheme": "file", "format": "json", @@ -115,6 +124,17 @@ { "name": "barley.json", "type": "table", + "description": "The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites.\n\nIt was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper \"Statistical Determination of Barley Varietal Adaption\".\n\nR.A. Fisher's popularized its use in the field of statistics when he included it in his book \"The Design of Experiments\".\n\nSince then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s.\n", + "sources": [ + { + "title": "The Design of Experiments Reference", + "path": "https://en.wikipedia.org/wiki/The_Design_of_Experiments" + }, + { + "title": "Trellis Charts Paper", + "path": "http://ml.stat.purdue.edu/stat695t/writings/TrellisDesignControl.pdf" + } + ], "path": "barley.json", "scheme": "file", "format": "json", @@ -149,6 +169,13 @@ { "name": "birdstrikes.csv", "type": "table", + "description": "Records of reported wildlife strikes received by the U.S. FAA", + "sources": [ + { + "title": "FAA Wildlife Strike Database", + "path": "http://wildlife.faa.gov" + } + ], "path": "birdstrikes.csv", "scheme": "file", "format": "csv", @@ -218,6 +245,13 @@ { "name": "budget.json", "type": "table", + "description": "Historical and forecasted federal revenue/receipts produced in 2016 by the U.S. 
Office of Management and Budget.", + "sources": [ + { + "title": "Office of Management and Budget - Budget FY 2016 - Receipts", + "path": "https://www.govinfo.gov/app/details/BUDGET-2016-DB/BUDGET-2016-DB-3" + } + ], "path": "budget.json", "scheme": "file", "format": "json", @@ -554,6 +588,17 @@ { "name": "burtin.json", "type": "table", + "description": "The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine.\n\nThe dataset compares the performance of three antibiotics against 16 different bacteria.\n\nNumerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness.\n\nThe dataset was featured as an example in the Protovis project, a precursor to D3.js.\n\nAs noted in the Protovis example, \"Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 μg/ml, and an exaggeration of some values for penicillin\".\n\nThe vega-datsets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together.\n\nThe caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) reads as follows:\n\n> ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin\n>\n>\n> The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in > red and gram- in blue) with their sensitivities to penicillin, and streptomycin.\n>\n> The effectiveness of the antibiotics is expressed as the highest dilution in μ/ml. 
which inhibits > the test organism.\n>\n> High dilutions are toward the periphery; consequently the length of the colored bar is proportional > to the effectiveness.\n>\n> It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. > fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. > vulgaris, S. schottmuelleri and M. tuberculosis.\n>\n> Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to > neomycin, although the majority of these are sensitive to neomycin.\n>\n> It also inhibits actinomycetes, but is inactive against viruses and fungi. Its mode of action is > not understood.\n", + "sources": [ + { + "title": "Scope Magazine", + "path": "https://graphicdesignarchives.org/projects/scope-magazine-vol-iii-5/" + }, + { + "title": "Protovis Antibiotics Example", + "path": "https://mbostock.github.io/protovis/ex/antibiotics-burtin.html" + } + ], "path": "burtin.json", "scheme": "file", "format": "json", @@ -596,6 +641,13 @@ { "name": "cars.json", "type": "table", + "description": "Collection of car specifications and performance metrics from various automobile manufacturers.", + "sources": [ + { + "title": "StatLib Datasets Archive", + "path": "http://lib.stat.cmu.edu/datasets/" + } + ], "path": "cars.json", "scheme": "file", "format": "json", @@ -650,6 +702,13 @@ { "name": "co2-concentration.csv", "type": "table", + "description": "Scripps CO2 program data ut modified to only include date, CO2, seasonally adjusted CO2. 
\nOnly includes rows with valid data.", + "sources": [ + { + "title": "Scripps CO2 Program", + "path": "https://scrippsco2.ucsd.edu/data/atmospheric_co2/primary_mlo_co2_record" + } + ], "path": "co2-concentration.csv", "scheme": "file", "format": "csv", @@ -675,6 +734,25 @@ { "name": "countries.json", "type": "table", + "description": "This dataset combines key demographic indicators (life expectancy at birth and\nfertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year\nintervals. It includes both current values and adjacent time period values (previous and next)\nfor each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/\ndocumentation/) notes that its philosophy is to fill data gaps with estimates and use current\ngeographic boundaries for historical data. Gapminder states that it aims to \"show people the\nbig picture\" rather than support detailed numeric analysis.", + "licenses": [ + { + "title": "Creative Commons Attribution 4.0 International", + "path": "https://www.gapminder.org/free-material/" + } + ], + "sources": [ + { + "title": "Gapminder Foundation - Life Expectancy", + "path": "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676", + "version": "14" + }, + { + "title": "Gapminder Foundation - Fertility", + "path": "https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676", + "version": "14" + } + ], "path": "countries.json", "scheme": "file", "format": "json", @@ -693,27 +771,33 @@ }, { "name": "year", - "type": "integer" + "type": "integer", + "description": "Years from 1955 to 2000 at 5-year intervals" }, { "name": "fertility", - "type": "number" + "type": "number", + "description": "Fertility rate (average number of children per woman) for the given year" }, { "name": "life_expect", - "type": "number" + "type": "number", + "description": "Life expectancy in 
years for the given year" }, { "name": "n_fertility", - "type": "number" + "type": "number", + "description": "Fertility rate for the next 5-year interval" }, { "name": "n_life_expect", - "type": "number" + "type": "number", + "description": "Life expectancy for the next 5-year interval" }, { "name": "country", - "type": "string" + "type": "string", + "description": "Name of the country" } ] } @@ -755,6 +839,13 @@ { "name": "disasters.csv", "type": "table", + "description": "Annual number of deaths from disasters.", + "sources": [ + { + "title": "Our World in Data - Natural Catastrophes", + "path": "https://ourworldindata.org/natural-catastrophes" + } + ], "path": "disasters.csv", "scheme": "file", "format": "csv", @@ -780,6 +871,12 @@ { "name": "driving.json", "type": "table", + "sources": [ + { + "title": "New York Times", + "path": "https://archive.nytimes.com/www.nytimes.com/imagepages/2010/05/02/business/02metrics.html" + } + ], "path": "driving.json", "scheme": "file", "format": "json", @@ -814,6 +911,13 @@ { "name": "earthquakes.json", "type": "json", + "description": "Earthquake data retrieved Feb 6, 2018", + "sources": [ + { + "title": "USGS Earthquake Feed", + "path": "https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson" + } + ], "path": "earthquakes.json", "scheme": "file", "format": "geojson", @@ -823,6 +927,7 @@ { "name": "ffox.png", "type": "file", + "description": "Application icons from open-source software projects.", "path": "ffox.png", "scheme": "file", "format": "png", @@ -884,6 +989,13 @@ { "name": "flights-10k.json", "type": "table", + "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", + "sources": [ + { + "title": "U.S. 
Bureau of Transportation Statistics", + "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" + } + ], "path": "flights-10k.json", "scheme": "file", "format": "json", @@ -922,6 +1034,13 @@ { "name": "flights-200k.arrow", "type": "table", + "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", + "sources": [ + { + "title": "U.S. Bureau of Transportation Statistics", + "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" + } + ], "path": "flights-200k.arrow", "scheme": "file", "format": ".arrow", @@ -946,6 +1065,13 @@ { "name": "flights-200k.json", "type": "table", + "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", + "sources": [ + { + "title": "U.S. Bureau of Transportation Statistics", + "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" + } + ], "path": "flights-200k.json", "scheme": "file", "format": "json", @@ -976,6 +1102,13 @@ { "name": "flights-20k.json", "type": "table", + "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", + "sources": [ + { + "title": "U.S. Bureau of Transportation Statistics", + "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" + } + ], "path": "flights-20k.json", "scheme": "file", "format": "json", @@ -1014,6 +1147,13 @@ { "name": "flights-2k.json", "type": "table", + "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", + "sources": [ + { + "title": "U.S. 
Bureau of Transportation Statistics", + "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" + } + ], "path": "flights-2k.json", "scheme": "file", "format": "json", @@ -1052,6 +1192,13 @@ { "name": "flights-3m.parquet", "type": "table", + "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", + "sources": [ + { + "title": "U.S. Bureau of Transportation Statistics", + "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" + } + ], "path": "flights-3m.parquet", "scheme": "file", "format": "parquet", @@ -1084,6 +1231,13 @@ { "name": "flights-5k.json", "type": "table", + "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", + "sources": [ + { + "title": "U.S. Bureau of Transportation Statistics", + "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" + } + ], "path": "flights-5k.json", "scheme": "file", "format": "json", @@ -1122,6 +1276,13 @@ { "name": "flights-airport.csv", "type": "table", + "description": "Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py`", + "sources": [ + { + "title": "U.S. Bureau of Transportation Statistics", + "path": "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" + } + ], "path": "flights-airport.csv", "scheme": "file", "format": "csv", @@ -1147,6 +1308,13 @@ { "name": "football.json", "type": "table", + "description": "Football match outcomes across multiple divisions from 2013 to 2017, part of a\nlarger dataset from OpenFootball. 
The subset was made such that there are records for all five\nchosen divisions over the time period.", + "sources": [ + { + "title": "OpenFootball", + "path": "https://github.com/openfootball/football.json" + } + ], "path": "football.json", "scheme": "file", "format": "json", @@ -1189,6 +1357,23 @@ { "name": "gapminder-health-income.csv", "type": "table", + "description": "Per-capita income, life expectancy, population and regional grouping. Dataset does not specify \nthe reference year for the data. Gapminder historical data is subject to revisions.\n\nGapminder (v30, 2023) defines per-capita income as follows:\n>\"This is real GDP per capita (gross domestic product per person adjusted for inflation) \n>converted to international dollars using purchasing power parity rates. An international dollar \n>has the same purchasing power over GDP as the U.S. dollar has in the United States.\"\n", + "licenses": [ + { + "title": "Creative Commons Attribution 4.0 International", + "path": "https://www.gapminder.org/free-material/" + } + ], + "sources": [ + { + "title": "Gapminder Foundation", + "path": "https://www.gapminder.org" + }, + { + "title": "Gapminder GDP Per Capita Data", + "path": "https://docs.google.com/spreadsheets/d/1i5AEui3WZNZqh7MQ4AKkJuCz4rRxGR_pw_9gtbcBOqQ/edit?gid=501532268#gid=501532268" + } + ], "path": "gapminder-health-income.csv", "scheme": "file", "format": "csv", @@ -1222,6 +1407,49 @@ { "name": "gapminder.json", "type": "table", + "description": "This dataset combines key demographic indicators (life expectancy at birth, \npopulation, and fertility rate measured as babies per woman) for various countries from 1955 \nto 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable \ngrouping countries. Gapminder's data documentation notes that its philosophy is to fill data \ngaps with estimates and use current geographic boundaries for historical data. 
Gapminder \nstates that it aims to \"show people the big picture\" rather than support detailed numeric \nanalysis.\n\nNotes:\n1. Country Selection: The set of countries in this file matches the version of this dataset \n originally added to this collection in 2015. The specific criteria for country selection \n in that version are not known. Data for Aruba are no longer available in the new version. \n Hong Kong has been revised to Hong Kong, China in the new version.\n\n2. Data Precision: The precision of float values may have changed from the original version. \n These changes reflect the most recent source data used for each indicator.\n\n3. Regional Groupings: The 'cluster' column represents a regional mapping of countries \n corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. To \n preserve continuity with previous versions of this dataset, we have retained the column \n name 'cluster' instead of renaming it to 'six_regions'. The six regions represented are: \n `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, \n 4: east_asia_pacific, 5: middle_east_north_africa`.", + "sources": [ + { + "title": "Gapminder Foundation - Life Expectancy (Data)", + "path": "https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676", + "version": "14" + }, + { + "title": "Gapminder Foundation - Life Expectancy (Documentation)", + "path": "https://www.gapminder.org/data/documentation/gd004/" + }, + { + "title": "Gapminder Foundation - Population (Data)", + "path": "https://docs.google.com/spreadsheets/d/1c1luQNdpH90tNbMIeU7jD__59wQ0bdIGRFpbMm8ZBTk/edit?gid=176703676#gid=176703676", + "version": "7" + }, + { + "title": "Gapminder Foundation - Population (Documentation)", + "path": "https://www.gapminder.org/data/documentation/gd003/" + }, + { + "title": "Gapminder Foundation - Fertility (Data)", + "path": 
"https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676", + "version": "14" + }, + { + "title": "Gapminder Foundation - Fertility (Documentation)", + "path": "https://www.gapminder.org/data/documentation/gd008/" + }, + { + "title": "Gapminder Foundation - Data Geographies (Data)", + "path": "https://docs.google.com/spreadsheets/d/1qHalit8sXC0R8oVXibc2wa2gY7bkwGzOybEMTWp-08o/edit?gid=1597424158#gid=1597424158", + "version": "2" + }, + { + "title": "Gapminder Foundation - Data Geographies (Documentation)", + "path": "https://www.gapminder.org/data/geo/" + }, + { + "title": "Gapminder Data Documentation", + "path": "https://www.gapminder.org/data/documentation/" + } + ], "path": "gapminder.json", "scheme": "file", "format": "json", @@ -1236,27 +1464,33 @@ "fields": [ { "name": "year", - "type": "integer" + "type": "integer", + "description": "Years from 1955 to 2005 at 5-year intervals" }, { "name": "country", - "type": "string" + "type": "string", + "description": "Name of the country" }, { "name": "cluster", - "type": "integer" + "type": "integer", + "description": "A categorical variable (values 0-5) grouping countries by region" }, { "name": "pop", - "type": "integer" + "type": "integer", + "description": "Population of the country" }, { "name": "life_expect", - "type": "number" + "type": "number", + "description": "Life expectancy in years" }, { "name": "fertility", - "type": "number" + "type": "number", + "description": "Fertility rate (average number of children per woman)" } ] } @@ -1264,6 +1498,7 @@ { "name": "gimp.png", "type": "file", + "description": "Application icons from open-source software projects.", "path": "gimp.png", "scheme": "file", "format": "png", @@ -1273,6 +1508,7 @@ { "name": "github.csv", "type": "table", + "description": "Generated using `/scripts/github.py`.", "path": "github.csv", "scheme": "file", "format": "csv", @@ -1294,6 +1530,13 @@ { "name": 
"global-temp.csv", "type": "table", + "description": "Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, L-OTI), 1880-2023.", + "sources": [ + { + "title": "NASA Goddard Institute for Space Studies", + "path": "https://data.giss.nasa.gov/gistemp/" + } + ], "path": "global-temp.csv", "scheme": "file", "format": "csv", @@ -1357,6 +1600,13 @@ { "name": "iowa-electricity.csv", "type": "table", + "description": "The state of Iowa has dramatically increased its production of renewable \nwind power in recent years. This file contains the annual net generation of electricity in \nthe state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. \nIt is useful for illustrating stacked area charts.", + "sources": [ + { + "title": "U.S. Energy Information Administration", + "path": "https://www.eia.gov/beta/electricity/data/browser/#/topic/0?agg=2,0,1&fuel=vvg&geo=00000g&sec=g&linechart=ELEC.GEN.OTH-IA-99.A~ELEC.GEN.COW-IA-99.A~ELEC.GEN.PEL-IA-99.A~ELEC.GEN.PC-IA-99.A~ELEC.GEN.NG-IA-99.A~~ELEC.GEN.NUC-IA-99.A~ELEC.GEN.HYC-IA-99.A~ELEC.GEN.AOR-IA-99.A~ELEC.GEN.HPS-IA-99.A~&columnchart=ELEC.GEN.ALL-IA-99.A&map=ELEC.GEN.ALL-IA-99.A&freq=A&start=2001&end=2017&ctype=linechart&ltype=pin&tab=overview&maptype=0&rse=0&pin=" + } + ], "path": "iowa-electricity.csv", "scheme": "file", "format": "csv", @@ -1382,6 +1632,14 @@ { "name": "jobs.json", "type": "table", + "description": "U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census microdata\" from as early as 1790.\n\nOriginally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Viégas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). 
The dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/).\n\nData is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of IPUMS USA, according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions).\n\nIPUMS USA confirmed in 2024 correspondence that hosting this dataset on vega-datasets is permissible, stating:\n>We're excited to hear that this dataset made its way to this repository and is being used by students for data visualization. We allow for these types of redistributions of summary data so long as the underlying microdata records are not shared.\n\nThis dataset contains only summary statistics and does not include any underlying microdata records.\n\n1. This dataset represents summary data. The underlying microdata records are not included.\n2. Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section) (person weight) variable as an expansion factor when working with IPUMS USA extracts.\n3. Due to coding revisions, figures for earlier years (particularly 19th century) may not match current IPUMS USA data exactly.\n\nWhen using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml). The organization requests use of the following citation for this json file:\n\nSteven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. 
http://doi.org/10.18128/D010.V6.0\n", + "sources": [ + { + "title": "IPUMS USA", + "path": "https://usa.ipums.org/usa/", + "version": "6.0" + } + ], "path": "jobs.json", "scheme": "file", "format": "json", @@ -1396,23 +1654,28 @@ "fields": [ { "name": "job", - "type": "string" + "type": "string", + "description": "The occupation title" }, { "name": "sex", - "type": "string" + "type": "string", + "description": "Sex (men/women)" }, { "name": "year", - "type": "integer" + "type": "integer", + "description": "Census year" }, { "name": "count", - "type": "integer" + "type": "integer", + "description": "Number of individuals in the occupation" }, { "name": "perc", - "type": "number" + "type": "number", + "description": "Percentage of the workforce in the occupation" } ] } @@ -1420,6 +1683,13 @@ { "name": "la-riots.csv", "type": "table", + "description": "More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles \nfor five days starting on April 29, 1992. This file contains metadata about each person, including the geographic \ncoordinates of their death. Compiled and published by the Los Angeles Times Data Desk.", + "sources": [ + { + "title": "LA Riots Deaths, Los Angeles Times Data Desk", + "path": "http://spreadsheets.latimes.com/la-riots-deaths/" + } + ], "path": "la-riots.csv", "scheme": "file", "format": "csv", @@ -1477,6 +1747,13 @@ { "name": "londonboroughs.json", "type": "json", + "description": "Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. 
\nOriginal data \"contains National Statistics data © Crown copyright and database right (2015)\" \nand \"Contains Ordnance Survey data © Crown copyright and database right [2015].\"", + "sources": [ + { + "title": "Statistical GIS Boundary Files, London Datastore", + "path": "https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london" + } + ], "path": "londonBoroughs.json", "scheme": "file", "format": "topojson", @@ -1486,6 +1763,7 @@ { "name": "londoncentroids.json", "type": "table", + "description": "Calculated from `londonBoroughs.json` using `d3.geoCentroid`.", "path": "londonCentroids.json", "scheme": "file", "format": "json", @@ -1516,6 +1794,13 @@ { "name": "londontubelines.json", "type": "json", + "description": "Selected rail lines simplified from source.", + "sources": [ + { + "title": "London Tube Data", + "path": "https://github.com/oobrien/vis/tree/master/tube/data" + } + ], "path": "londonTubeLines.json", "scheme": "file", "format": "topojson", @@ -1580,6 +1865,17 @@ { "name": "monarchs.json", "type": "table", + "description": "A chronological list of English and British monarchs from Elizabeth I through George IV.\nEach entry includes:\n\nThe dataset contains two intentional inaccuracies to maintain compatibility with \nthe [Wheat and Wages](https://vega.github.io/vega/examples/wheat-and-wages/) example visualization:\n1. the start date for the reign of Elizabeth I is shown as 1565, instead of 1558;\n2. the end date for the reign of George IV is shown as 1820, instead of 1830.\nThese discrepancies align the `monarchs.json` dataset with the start and end dates of the `wheat.json` dataset used in the visualization.\nThe entry \"W&M\" represents the joint reign of William III and Mary II. 
While the dataset shows their reign as 1689-1702, \nthe official Web site of the British royal family indicates that Mary II's reign ended in 1694, though William III continued to rule until 1702.\nThe `commonwealth` field is used to flag the period from 1649 to 1660, which includes the Commonwealth of England, the Protectorate, \nand the period leading to the Restoration. While historically more accurate to call this the \"interregnum,\" the field name of `commonwealth` \nfrom the original dataset is retained for backwards compatibility.\nThe dataset was revised in Aug. 2024. James II's reign now ends in 1688 (previously 1689).\nSource data has been verified against the kings & queens and interregnum pages of the official website of the British royal family (retrieved in Aug. 2024).\nContent on the site is protected by Crown Copyright. \nUnder the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most \nCrown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/).", + "sources": [ + { + "title": "The Royal Family - Kings & Queens", + "path": "https://www.royal.uk/kings-and-queens-1066" + }, + { + "title": "The Royal Family - Interregnum", + "path": "https://www.royal.uk/interregnum-1649-1660" + } + ], "path": "monarchs.json", "scheme": "file", "format": "json", @@ -1594,19 +1890,23 @@ "fields": [ { "name": "name", - "type": "string" + "type": "string", + "description": "The ruler's name or identifier (e.g., \"W&M\" for William and Mary, \"Cromwell\" for the period of interregnum)" }, { "name": "start", - "type": "integer" + "type": "integer", + "description": "The year their rule began" }, { "name": "end", - "type": "integer" + "type": "integer", + "description": "The year their rule ended" }, { "name": "index", - "type": "integer" + 
"type": "integer", + "description": "A zero-based sequential number assigned to each entry, representing the chronological order of rulers" } ] } @@ -1614,6 +1914,7 @@ { "name": "movies.json", "type": "table", + "description": "The dataset has well known and intentionally included errors. \nThis dataset is provided for instructional purposes, including the need to reckon with dirty data.", "path": "movies.json", "scheme": "file", "format": "json", @@ -1752,6 +2053,13 @@ { "name": "ohlc.json", "type": "table", + "description": "This dataset contains the performance of the Chicago Board Options Exchange \n[Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/\n%5EVIX#overview)) in the summer of 2009.", + "sources": [ + { + "title": "Yahoo Finance VIX Data", + "path": "https://finance.yahoo.com/chart/%5EVIX" + } + ], "path": "ohlc.json", "scheme": "file", "format": "json", @@ -1798,6 +2106,17 @@ { "name": "penguins.json", "type": "table", + "description": "Palmer Archipelago (Antarctica) penguin data collected and made available by \n[Dr. 
Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) \nand the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research \nNetwork](https://lternet.edu/).", + "sources": [ + { + "title": "Palmer Station Antarctica LTER", + "path": "https://pal.lternet.edu/" + }, + { + "title": "Allison Horst's Penguins Repository", + "path": "https://github.com/allisonhorst/penguins" + } + ], "path": "penguins.json", "scheme": "file", "format": "json", @@ -1844,6 +2163,13 @@ { "name": "platformer-terrain.json", "type": "table", + "description": "Assets from the video game Celeste.", + "sources": [ + { + "title": "Celeste Game", + "path": "http://www.celestegame.com/" + } + ], "path": "platformer-terrain.json", "scheme": "file", "format": "json", @@ -1920,6 +2246,17 @@ { "name": "political-contributions.json", "type": "table", + "description": "Summary financial information on contributions to candidates for U.S. \nelections. An updated version of this dataset is available from the \"all candidates\" files \n(in pipe-delimited format) on the bulk data download page of the U.S. Federal Election \nCommission, or, alternatively, via OpenFEC. Information on each of the 25 columns is \navailable from the [FEC All Candidates File Description](https://www.fec.gov/\ncampaign-finance-data/all-candidates-file-description/). 
The sample dataset in \n`political-contributions.json` contains 58 records with dates from 2015.\n\nFEC data is subject to the commission's:\n- [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/)\n- [Privacy and Security Policy](https://www.fec.gov/about/privacy-and-security-policy/)\n- [Acceptable Use Policy](https://github.com/fecgov/FEC/blob/master/ACCEPTABLE-USE-POLICY.md)\n\nAdditionally, the FEC's Github [repository](https://github.com/fecgov/FEC) states:\n> This project is in the public domain within the United States, and we waive worldwide \n> copyright and related rights through [CC0 universal public domain](https://\n> creativecommons.org/publicdomain/zero/1.0/) dedication. Read more on our license page. \n> A few restrictions limit the way you can use FEC data. For example, you can't use \n> contributor lists for commercial purposes or to solicit donations. Learn more on \n> [FEC.gov](https://www.fec.gov/).", + "sources": [ + { + "title": "Federal Election Commission Bulk Data", + "path": "https://www.fec.gov/data/browse-data/?tab=bulk-data" + }, + { + "title": "OpenFEC API", + "path": "https://api.open.fec.gov/developers/" + } + ], "path": "political-contributions.json", "scheme": "file", "format": "json", @@ -2038,6 +2375,13 @@ { "name": "population.json", "type": "table", + "description": "United States population statistics by sex and age group across decades between 1850 and 2000. \nThe dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census \nmicrodata\" from as early as 1790.\n\nIPUMS updates and revises datasets over time, which may result in discrepancies between this \ndataset and current IPUMS data. Details on data revisions are available here.\n\nWhen using this dataset, please refer to IPUMS USA terms of use. The organization requests the \nuse of the following citation for this json file:\nSteven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. 
Integrated \nPublic Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. \nhttp://doi.org/10.18128/D010.V6.0\n", + "sources": [ + { + "title": "IPUMS USA", + "path": "https://usa.ipums.org/usa/" + } + ], "path": "population.json", "scheme": "file", "format": "json", @@ -2052,6 +2396,23 @@ "fields": [ { "name": "year", - "type": "integer" + "type": "integer", + "description": "Four-digit year of the survey" }, { "name": "age", - "type": "integer" + "type": "integer", + "description": "Age group in 5-year intervals (0=0-4, 5=5-9, 10=10-14, ..., 90=90+)" }, { "name": "sex", - "type": "integer" + "type": "integer", + "description": "Sex (1=men, 2=women)" }, { "name": "people", - "type": "integer" + "type": "integer", + "description": "Number of individuals (IPUMS PERWT)" } ] } @@ -2072,6 +2420,21 @@ { "name": "population_engineers_hurricanes.csv", "type": "table", + "description": "Per-state data on population, number of engineers, and hurricanes. Used in Vega-Lite example,\n[Three Choropleths Representing Disjoint Data from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html)", + "sources": [ + { + "title": "Bureau of Labor Statistics", + "path": "https://www.bls.gov/oes/tables.htm" + }, + { + "title": "American Community Survey", + "path": "https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_07_3YR_S1901&prodType=table" + }, + { + "title": "NOAA National Climatic Data Center", + "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/records" + } + ], "path": "population_engineers_hurricanes.csv", "scheme": "file", "format": "csv", @@ -2105,6 +2468,13 @@ { "name": "seattle-weather-hourly-normals.csv", "type": "table", + "description": "Hourly weather normals with metric units. The 1981-2010 Climate Normals are \nNCDC's three-decade averages of climatological variables, including temperature and \nprecipitation. 
Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/\ndocumentation/NORMAL_HLY_documentation.pdf). We only included temperature, wind, and pressure \nand updated the format to be easier to parse.", + "sources": [ + { + "title": "NOAA National Climatic Data Center (NCDC)", + "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/normals" + } + ], "path": "seattle-weather-hourly-normals.csv", "scheme": "file", "format": "csv", @@ -2134,6 +2504,13 @@ { "name": "seattle-weather.csv", "type": "table", + "description": "Daily weather records with metric units. Transformed using `/scripts/weather.py`. \nThe categorical \"weather\" field is synthesized from multiple fields in the original dataset. \nThis data is intended for instructional purposes.", + "sources": [ + { + "title": "NOAA National Climatic Data Center", + "path": "https://www.ncdc.noaa.gov/cdo-web/datatools/records" + } + ], "path": "seattle-weather.csv", "scheme": "file", "format": "csv", @@ -2171,6 +2548,13 @@ { "name": "sp500-2000.csv", "type": "table", + "description": "S&P 500 index values from 2000 to 2020.", + "sources": [ + { + "title": "Yahoo Finance", + "path": "https://finance.yahoo.com/quote/%5EDJI/history/" + } + ], "path": "sp500-2000.csv", "scheme": "file", "format": "csv", @@ -2284,6 +2668,21 @@ { "name": "unemployment-across-industries.json", "type": "table", + "description": "Industry-level unemployment statistics from the Current Population Survey \n(CPS), published monthly by the U.S. Bureau of Labor Statistics. Includes unemployed persons \nand unemployment rate across 11 private industries, as well as agricultural, government, and \nself-employed workers. Covers January 2000 through February 2010. Industry classification \nfollows format of CPS Table A-31.\n\nThe dataset can be replicated using the BLS API. 
For more, see the `scripts` folder of this \nrepository.\n\nThe BLS Web site states:\n> \"Users of the public API should cite the date that data were accessed or retrieved using \n> the API. Users must clearly state that \"BLS.gov cannot vouch for the data or analyses \n> derived from these data after the data have been retrieved from BLS.gov.\" The BLS.gov logo \n> may not be used by persons who are not BLS employees or on products (including web pages) \n> that are not BLS-sponsored.\"\n\nSee full BLS [terms of service](https://www.bls.gov/developers/termsOfService.htm).", + "sources": [ + { + "title": "U.S. Census Bureau Current Population Survey", + "path": "https://www.census.gov/programs-surveys/cps.html" + }, + { + "title": "BLS LAUS Data Tools", + "path": "https://www.bls.gov/lau/data.htm" + }, + { + "title": "Bureau of Labor Statistics Table A-31", + "path": "https://www.bls.gov/web/empsit/cpseea31.htm" + } + ], "path": "unemployment-across-industries.json", "scheme": "file", "format": "json", @@ -2298,27 +2697,33 @@ "fields": [ { "name": "series", - "type": "string" + "type": "string", + "description": "Industry name" }, { "name": "year", - "type": "integer" + "type": "integer", + "description": "Year (2000-2010)" }, { "name": "month", - "type": "integer" + "type": "integer", + "description": "Month (1-12)" }, { "name": "count", - "type": "integer" + "type": "integer", + "description": "Number of unemployed persons (in thousands)" }, { "name": "rate", - "type": "number" + "type": "number", + "description": "Unemployment rate (percentage)" }, { "name": "date", - "type": "datetime" + "type": "datetime", + "description": "ISO 8601-formatted date string (e.g., \"2000-01-01T08:00:00.000Z\")" } ] } @@ -2326,6 +2731,17 @@ { "name": "unemployment.tsv", "type": "table", + "description": "This dataset contains county-level unemployment rates in the United States, with data generally\nconsistent with levels reported in 2009. 
The dataset is structured as tab-separated values.\nThe unemployment rate represents the number of unemployed persons as a percentage of the labor\nforce. According to the Bureau of Labor Statistics (BLS) glossary:\n\nUnemployed persons (Current Population Survey) [are] persons aged 16 years and older who had\nno employment during the reference week, were available for work, except for temporary\nillness, and had made specific efforts to find employment sometime during the 4-week period\nending with the reference week. Persons who were waiting to be recalled to a job from which\nthey had been laid off need not have been looking for work to be classified as unemployed.\n\nThis dataset is derived from the [Local Area Unemployment Statistics (LAUS)](https://www.bls.gov/lau/) program, \na federal-state cooperative effort overseen by the Bureau of Labor Statistics (BLS). \nThe LAUS program produces monthly and annual employment, unemployment, and labor force data for census regions and divisions,\nstates, counties, metropolitan areas, and many cities and towns.\n\nFor the most up-to-date LAUS data:\n1. **Monthly and Annual Data Downloads**:\n- Visit the [LAUS Data Tools](https://www.bls.gov/lau/data.htm) page for [monthly](https://www.bls.gov/lau/tables.htm#mcounty) \nand [annual](https://www.bls.gov/lau/tables.htm#cntyaa) county data.\n2. 
**BLS Public Data API**:\n- The BLS provides an API for developers to access various datasets, including LAUS data.\n- To use the API for LAUS data, refer to the [LAUS Series ID Formats](https://www.bls.gov/help/hlpforma.htm#LA) to construct your query.\n- API documentation and examples are available on the BLS Developers page.\n\nWhen using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm).", + "sources": [ + { + "title": "BLS Developers API", + "path": "https://www.bls.gov/developers/" + }, + { + "title": "BLS Handbook of Methods", + "path": "https://www.bls.gov/opub/hom/lau/home.htm" + } + ], "path": "unemployment.tsv", "scheme": "file", "format": "tsv", @@ -2340,11 +2756,13 @@ "fields": [ { "name": "id", - "type": "integer" + "type": "integer", + "description": "The combined state and county FIPS code" }, { "name": "rate", - "type": "number" + "type": "number", + "description": "The unemployment rate for the county" } ] } @@ -2387,6 +2805,13 @@ { "name": "us-employment.csv", "type": "table", + "description": "In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job \nlosses across the United States. The downturn in employment, and the slow recovery in hiring that \nfollowed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau \nof Labor Statistics.\n\nThis file contains the monthly employment total in a variety of job categories from January 2006 \nthrough December 2015. The numbers are seasonally adjusted and reported in thousands. The data \nwere downloaded on Nov. 11, 2018, and reformatted for use in this library.\n\nTotals are included for the [22 \"supersectors\"](https://download.bls.gov/pub/time.series/ce/ce.supersector) tracked by the BLS. 
The \"nonfarm\" total is the category typically used by \neconomists and journalists as a stand-in for the country's employment total.\n\nA calculated \"nonfarm_change\" column has been appended with the month-to-month change in that \nsupersector's employment. It is useful for illustrating how to make bar charts that report both \nnegative and positive values.\n", + "sources": [ + { + "title": "U.S. Bureau of Labor Statistics Current Employment Statistics", + "path": "https://www.bls.gov/ces/" + } + ], "path": "us-employment.csv", "scheme": "file", "format": "csv", @@ -2530,6 +2955,13 @@ { "name": "volcano.json", "type": "json", + "description": "Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. \nThis data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a \ntopographic map by Ross Ihaka, adapted from R datasets. These data should not be regarded as accurate.", + "sources": [ + { + "title": "R Datasets", + "path": "https://stat.ethz.ch/R-manual/R-patched/library/datasets/html/volcano.html" + } + ], "path": "volcano.json", "scheme": "file", "format": "json", @@ -2539,6 +2971,13 @@ { "name": "weather.csv", "type": "table", + "description": "NOAA data transformed using `/scripts/weather.py`. Categorical \"weather\" field synthesized \nfrom multiple fields in the original dataset. 
This data is intended for instructional purposes.", + "sources": [ + { + "title": "NOAA Climate Data Online", + "path": "http://www.ncdc.noaa.gov/cdo-web/datatools/findstation" + } + ], "path": "weather.csv", "scheme": "file", "format": "csv", @@ -2580,6 +3019,7 @@ { "name": "weather.json", "type": "json", + "description": "Instructional dataset showing actual and predicted temperature data.", "path": "weather.json", "scheme": "file", "format": "json", @@ -2589,6 +3029,13 @@ { "name": "wheat.json", "type": "table", + "description": "In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair), a Scottish engineer who is often credited as the founder of \nstatistical graphics, published an elegant chart on the price of wheat. It plots 250 years \nof prices alongside weekly wages and the reigning monarch. He intended to demonstrate that \n\"never at any former period was wheat so cheap, in proportion to mechanical labour, as it \nis at the present time.\"", + "sources": [ + { + "title": "1822 Playfair Chart", + "path": "http://dh101.humanities.ucla.edu/wp-content/uploads/2014/08/Vis_2.jpg" + } + ], "path": "wheat.json", "scheme": "file", "format": "json", @@ -2619,6 +3066,7 @@ { "name": "windvectors.csv", "type": "table", + "description": "Simulated wind patterns over northwestern Europe.", "path": "windvectors.csv", "scheme": "file", "format": "csv", @@ -2661,6 +3109,13 @@ { "name": "zipcodes.csv", "type": "table", + "description": "GeoNames.org", + "sources": [ + { + "title": "GeoNames", + "path": "https://www.geonames.org" + } + ], "path": "zipcodes.csv", "scheme": "file", "format": "csv", diff --git a/scripts/build_datapackage.py b/scripts/build_datapackage.py index 856cdb04..de2dae98 100755 --- a/scripts/build_datapackage.py +++ b/scripts/build_datapackage.py @@ -39,10 +39,10 @@ import logging import os import warnings -from collections.abc import Mapping +from collections.abc import Mapping, Sequence from functools 
import partial from pathlib import Path -from typing import TYPE_CHECKING, NotRequired, Required, TypedDict, Unpack +from typing import TYPE_CHECKING, Any, NotRequired, Required, TypedDict, Unpack, cast import frictionless as fl import polars as pl @@ -68,7 +68,7 @@ ) if TYPE_CHECKING: - from collections.abc import Callable, Iterator, Sequence + from collections.abc import Callable, Iterator from typing import ClassVar, Literal @@ -200,12 +200,32 @@ def _extract_file_parts(cls, source: Path, /) -> dict[PathMeta, str]: @staticmethod def with_extras(resource: Resource, /, **extras: Unpack[ResourceMeta]) -> Resource: - """TODO: Use as part of https://github.com/vega/vega-datasets/pull/631#issuecomment-2503760452""" + """Supplement inferred metadata with manually defined ``extras``.""" + if "schema" in extras: + resource.schema = merge_schemas(resource, extra=extras.pop("schema")) for name, value in extras.items(): setattr(resource, name, value) return resource +def merge_schemas(resource: Resource, *, extra: Schema) -> fl.Schema: + if schema := resource.schema: + inferred = _flatten_schema(cast("Schema", schema.to_dict())) + else: + return fl.Schema.from_descriptor(cast("dict[str, Any]", extra)) + overrides = _flatten_schema(extra) + fields = [] + for name, field in inferred.items(): + if name in overrides: + field.update(overrides[name]) + fields.append(field) + return fl.Schema.from_descriptor({"fields": fields}) + + +def _flatten_schema(schema: Schema, /) -> dict[str, Field]: + return {field["name"]: field for field in schema["fields"]} + + class Source(TypedDict, total=False): title: str path: Required[str] @@ -229,10 +249,25 @@ class Contributor(TypedDict, total=False): organization: str +class Field(TypedDict, total=False): + """https://datapackage.org/standard/table-schema/#field.""" + + name: Required[str] + type: str + description: str + + +class Schema(TypedDict): + """https://datapackage.org/standard/table-schema/#properties.""" + + fields: Sequence[Field] 
+ + class ResourceMeta(TypedDict, total=False): description: str sources: Sequence[Source] licenses: Sequence[License] + schema: Schema class PackageMeta(TypedDict): @@ -294,12 +329,51 @@ def extract_package_metadata(repo_root: Path, /) -> PackageMeta: ) -def iter_resources(data_root: Path, /) -> Iterator[Resource]: - """Yield all parseable resources, selecting the most appropriate ``Resource`` class.""" +def extract_overrides(mapping: Mapping[str, Any], /) -> dict[str, ResourceMeta]: + if (resources := mapping.get("resources")) and isinstance(resources, Sequence): + return dict(iter_parse_resources(resources)) + else: + raise TypeError(resources) + + +def iter_parse_resources( + seq: Sequence[ResourceMeta], / +) -> Iterator[tuple[str, ResourceMeta]]: + for resource in seq: + if (name := resource.get("path")) and isinstance(name, str): + m: Any = {k: v for k, v in resource.items() if k != "path"} + # NOTE: Drops entries that only provide `path` + if m: + yield name, m + else: + raise TypeError(seq) + + +def iter_data_dir(data_root: Path, /) -> Iterator[Path]: + """Yield files in the root of the ``/data/`` directory.""" for fp in sorted(data_root.iterdir()): - if not fp.is_file(): - continue + if fp.is_file(): + yield fp + + +def iter_resources( + root: Path, /, overrides: dict[str, ResourceMeta] +) -> Iterator[Resource]: + """ + Yield all parseable resources, constructing with the most appropriate ``Resource`` class. + + Parameters + ---------- + root + Directory storing datasets. + overrides + Additional metadata, with a higher precedence than inferred. 
+ """ + for fp in iter_data_dir(root): if resource := ResourceAdapter.from_path(fp): + name = fp.name + if name in overrides: + resource = ResourceAdapter.with_extras(resource, **overrides[name]) yield resource else: msg = f"Skipping unexpected extension {fp.suffix!r}\n\n{fp!r}" @@ -307,6 +381,12 @@ def iter_resources(data_root: Path, /) -> Iterator[Resource]: continue +def read_toml(fp: Path, /) -> Mapping[str, Any]: + import tomllib + + return tomllib.loads(fp.read_text("utf-8")) + + def main( *, stem: str = "datapackage", @@ -317,6 +397,11 @@ def main( raise TypeError(msg) repo_dir: Path = Path(__file__).parent.parent data_dir: Path = repo_dir / "data" + sources_toml: Path = repo_dir / "SOURCES.toml" + + sources = read_toml(sources_toml) + # NOTE: Package metadata is expected to be stored in `sources` in the future + overrides = extract_overrides(sources) # NOTE: Forcing base directory here # - Ensures ``frictionless`` doesn't insert platform-specific path separator(s) os.chdir(data_dir) @@ -324,7 +409,7 @@ def main( logger.info( f"Collecting resources for '{pkg_meta['name']}@{pkg_meta['version']}' ..." 
) - pkg = Package(resources=list(iter_resources(data_dir)), **pkg_meta) # type: ignore[arg-type] + pkg = Package(resources=list(iter_resources(data_dir, overrides)), **pkg_meta) # type: ignore[arg-type] logger.info(f"Collected {len(pkg.resources)} resources") if output_format in {"json", "both"}: p = (repo_dir / f"{stem}.json").as_posix() From 6011269e9545b631ca45f206d0517587bfe09290 Mon Sep 17 00:00:00 2001 From: Daniel Sorid <63077097+dsmedia@users.noreply.github.com> Date: Thu, 12 Dec 2024 07:18:42 -0500 Subject: [PATCH 11/40] restore URL to single line Co-authored-by: Dan Redding <125183946+dangotbanned@users.noreply.github.com> --- SOURCES.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SOURCES.toml b/SOURCES.toml index ab89c191..6e5dcb1c 100644 --- a/SOURCES.toml +++ b/SOURCES.toml @@ -662,8 +662,8 @@ FEC data is subject to the commission's: Additionally, the FEC's Github [repository](https://github.com/fecgov/FEC) states: > This project is in the public domain within the United States, and we waive worldwide -> copyright and related rights through [CC0 universal public domain](https:// -> creativecommons.org/publicdomain/zero/1.0/) dedication. Read more on our license page. +> copyright and related rights through [CC0 universal public domain](https://creativecommons.org/publicdomain/zero/1.0/) +> dedication. Read more on our license page. > A few restrictions limit the way you can use FEC data. For example, you can't use > contributor lists for commercial purposes or to solicit donations. 
Learn more on > [FEC.gov](https://www.fec.gov/).""" From e5121d8eac2695049b076a90e00159fa5187ad4f Mon Sep 17 00:00:00 2001 From: ds <63077097+dsmedia@users.noreply.github.com> Date: Thu, 12 Dec 2024 07:35:47 -0500 Subject: [PATCH 12/40] fix: avoid breaks in URL and backtick-enclosed strings --- SOURCES.toml | 52 +++++++++++++++++++++++----------------------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/SOURCES.toml b/SOURCES.toml index 6e5dcb1c..3f8f3157 100644 --- a/SOURCES.toml +++ b/SOURCES.toml @@ -80,8 +80,8 @@ The vega-datsets version is largely consistent with the Protovis version of the correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a \ new column, 'Genus', to group related bacterial species together. -The caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/\ -wmgda_8616c.jpg) reads as follows: +The caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) +reads as follows: > ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin > @@ -135,8 +135,8 @@ path = "countries.json" description = """This dataset combines key demographic indicators (life expectancy at birth and fertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year intervals. It includes both current values and adjacent time period values (previous and next) -for each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/ -documentation/) notes that its philosophy is to fill data gaps with estimates and use current +for each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) +notes that its philosophy is to fill data gaps with estimates and use current geographic boundaries for historical data. 
Gapminder states that it aims to "show people the big picture" rather than support detailed numeric analysis.""" @@ -341,8 +341,7 @@ Notes: corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. To preserve continuity with previous versions of this dataset, we have retained the column name 'cluster' instead of renaming it to 'six_regions'. The six regions represented are: - `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, - 4: east_asia_pacific, 5: middle_east_north_africa`.""" + `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`.""" [resources.schema] [[resources.schema.fields]] @@ -447,9 +446,8 @@ sex and year across decades between 1850 and 2000. The dataset was obtained from "collects, preserves and harmonizes U.S. census microdata" from as early as 1790. Originally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, \ -Martin Wattenberg and Fernanda Viégas), described [here](https://homes.cs.washington.edu/~jheer/files/\ -bdata_ch12.pdf). The dataset is also referenced in this vega [example](https://vega.github.io/vega/\ -examples/job-voyager/). +Martin Wattenberg and Fernanda Viégas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). +The dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/). Data is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) \ variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of \ @@ -467,18 +465,16 @@ underlying microdata records are not shared. This dataset contains only summary statistics and does not include any underlying microdata records. 1. This dataset represents summary data. The underlying microdata records are not included. -2. 
Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/\ -usa-action/variables/PERWT#description_section) (person weight) variable as an expansion factor when \ -working with IPUMS USA extracts. +2. Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section) +(person weight) variable as an expansion factor when working with IPUMS USA extracts. 3. Due to coding revisions, figures for earlier years (particularly 19th century) may not match current \ IPUMS USA data exactly. -When using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/\ -terms.shtml). The organization requests use of the following citation for this json file: +When using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml). +The organization requests use of the following citation for this json file: Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use \ -Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/\ -D010.V6.0 +Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0 """ [resources.schema] @@ -651,9 +647,8 @@ description = """Summary financial information on contributions to candidates fo elections. An updated version of this datset is available from the "all candidates" files (in pipe-delimited format) on the bulk data download page of the U.S. Federal Election Commission, or, alternatively, via OpenFEC. Information on each of the 25 columns is -available from the [FEC All Candidates File Description](https://www.fec.gov/ -campaign-finance-data/all-candidates-file-description/). The sample dataset in -`political-contributions.json` contains 58 records with dates from 2015. 
+available from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/). +The sample dataset in `political-contributions.json` contains 58 records with dates from 2015. FEC data is subject to the commission's: - [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/) @@ -732,8 +727,8 @@ path = "https://www.ncdc.noaa.gov/cdo-web/datatools/records" path = "seattle-weather-hourly-normals.csv" description = """Hourly weather normals with metric units. The 1981-2010 Climate Normals are NCDC's three-decade averages of climatological variables, including temperature and -precipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/ -documentation/NORMAL_HLY_documentation.pdf). We only included temperature, wind, and pressure +precipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf). +We only included temperature, wind, and pressure and updated the format to be easier to parse.""" [[resources.sources]] @@ -887,8 +882,8 @@ This file contains the monthly employment total in a variety of job categories f through December 2015. The numbers are seasonally adjusted and reported in thousands. The data were downloaded on Nov. 11, 2018, and reformatted for use in this library. -Totals are included for the [22 \"supersectors\"](https://download.bls.gov/pub/time.series/ce/ -ce.supersector) tracked by the BLS. The \"nonfarm\" total is the category typically used by +Totals are included for the [22 \"supersectors\"](https://download.bls.gov/pub/time.series/ce/ce.supersector) +tracked by the BLS. The \"nonfarm\" total is the category typically used by economists and journalists as a stand-in for the country's employment total. 
A calculated \"nonfarm_change\" column has been appended with the month-to-month change in that @@ -927,12 +922,11 @@ description = """Instructional dataset showing actual and predicted temperature [[resources]] # Path: wheat.json path = "wheat.json" -description = """In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/ -wiki/William_Playfair), a Scottish engineer who is often credited as the founder of -statistical graphics, published an elegant chart on the price of wheat. It plots 250 years -of prices alongside weekly wages and the reigning monarch. He intended to demonstrate that -"never at any former period was wheat so cheap, in proportion to mechanical labour, as it -is at the present time."""" +description = """In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair), +a Scottish engineer who is often credited as the founder of statistical graphics, +published an elegant chart on the price of wheat. It plots 250 years of prices alongside +weekly wages and the reigning monarch. He intended to demonstrate that "never at any former period +was wheat so cheap, in proportion to mechanical labour, as it is at the present time."""" [[resources.sources]] title = "1822 Playfair Chart" From 7259af46f62c826ef3b7842403d99b26592b69f0 Mon Sep 17 00:00:00 2001 From: ds <63077097+dsmedia@users.noreply.github.com> Date: Thu, 12 Dec 2024 18:43:27 -0500 Subject: [PATCH 13/40] Remove SOURCES.md in favor of automated generation --- SOURCES.md | 423 ----------------------------------------------------- 1 file changed, 423 deletions(-) delete mode 100644 SOURCES.md diff --git a/SOURCES.md b/SOURCES.md deleted file mode 100644 index 520a8001..00000000 --- a/SOURCES.md +++ /dev/null @@ -1,423 +0,0 @@ -# Sources - -Still incomplete. See https://github.com/vega/vega-datasets/issues/15. - -## `7zip.png`, `ffox.png`, `gimp.png` - -Application icons from open-source software projects. 
- -## `annual-precip.json` - -A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell, from [CFSv2](https://www.ncdc.noaa.gov/data-access/model-data/model-datasets/climate-forecast-system-version2-cfsv2). - -## `airports.csv` - -## `anscombe.json` - -Graphs in Statistical Analysis, F. J. Anscombe, The American Statistician. - -## `barley.json` - -The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites. It was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper "Statistical Determination of Barley Varietal Adaption." R.A. Fisher's popularized its use in the field of statistics when he included it in his book ["The Design of Experiments."](https://en.wikipedia.org/wiki/The_Design_of_Experiments) Since then it has been used to demonstrate new statistical techniques, including the [trellis charts](http://ml.stat.purdue.edu/stat695t/writings/TrellisDesignControl.pdf) developed by Richard Becker, William Cleveland and others in the 1990s. - -## `birdstrikes.csv` - -http://wildlife.faa.gov - -## `budget.json` - -Source: Office of Management and Budget (U.S.) -[Budget FY 2016 - Receipts](https://www.govinfo.gov/app/details/BUDGET-2016-DB/BUDGET-2016-DB-3) - -## `budgets.json` - -## `burtin.json` - -The burtin.json dataset is based on graphic designer [Will Burtin's](https://en.wikipedia.org/wiki/Will_Burtin) 1951 visualization of antibiotic effectiveness, originally published in [Scope Magazine](https://graphicdesignarchives.org/projects/scope-magazine-vol-iii-5/). The dataset compares the performance of three antibiotics against 16 different bacteria. The numerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness. 
The dataset was featured as an [example](https://mbostock.github.io/protovis/ex/antibiotics-burtin.html) in the Protovis project, a precursor to D3.js. The Protovis example notes that, "Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 μg/ml, and an exaggeration of some values for penicillin." The vega-datsets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together. - -The caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) reads as follows: - -> ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin -> -> The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in red and gram- in blue) with their sensitivities to penicillin, and streptomycin. The effectiveness of the antibiotics is expressed as the highest dilution in μ/ml. which inhibits the test organism. High dilutions are toward the periphery; consequently the length of the colored bar is proportional to the effectiveness. It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. vulgaris, S. schottmuelleri and M. tuberculosis. Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to neomycin, although the majority of these are sensitive to neomycin. It also inhibits actinomycetes, but is inactive against viruses and fungi. Its mode of action is not understood. 
- -## `cars.json` - -http://lib.stat.cmu.edu/datasets/ - -## `co2-concentration.csv` - -https://scrippsco2.ucsd.edu/data/atmospheric_co2/primary_mlo_co2_record but modified to only include date, CO2, seasonally adjusted CO2 and only include rows with valid data. - -## `countries.json` -### Source -- **Original Data**: [Gapminder Foundation](https://www.gapminder.org/) -- **URLs**: - - Life Expectancy (v14): [Data](https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676) | [Reference](https://www.gapminder.org/data/documentation/gd004/) - - Fertility (v14): [Data](https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676) | [Reference](https://www.gapminder.org/data/documentation/gd008/) - -- **Date Accessed**: July 31, 2024 -- **License**: Creative Commons Attribution 4.0 International (CC BY 4.0) | [Reference](https://www.gapminder.org/free-material/) - -### Description -This dataset combines key demographic indicators (life expectancy at birth and fertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year intervals. It includes both current values and adjacent time period values (previous and next) for each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) notes that its philosophy is to fill data gaps with estimates and use current geographic boundaries for historical data. Gapminder states that it aims to "show people the big picture" rather than support detailed numeric analysis. - -#### Columns: -1. `year` (type: integer): Years from 1955 to 2000 at 5-year intervals -2. `country` (type: string): Name of the country -3. `fertility` (type: float): Fertility rate (average number of children per woman) for the given year -4. `life_expect` (type: float): Life expectancy in years for the given year -5. 
`p_fertility` (type: float): Fertility rate for the previous 5-year interval -6. `n_fertility` (type: float): Fertility rate for the next 5-year interval -7. `p_life_expect` (type: float): Life expectancy for the previous 5-year interval -8. `n_life_expect` (type: float): Life expectancy for the next 5-year interval - -## `crimea.json` - -## `disasters.csv` - -https://ourworldindata.org/natural-catastrophes - -## `driving.json` - -https://archive.nytimes.com/www.nytimes.com/imagepages/2010/05/02/business/02metrics.html - -## `earthquakes.json` - -https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson -(Feb 6, 2018) - -## `flare.json`, `flare-dependencies.json` - -## `flights-?k.json`, `flights-200k.arrow`, `flights-airport.csv` - -Flight delay statistics from U.S. Bureau of Transportation Statistics, https://www.transtats.bts.gov/OT_Delay/OT_DelayCause1.asp. - -Transformed using `/scripts/flights.js`. Arrow file generated with [json2arrow](https://github.com/domoritz/arrow-tools/tree/main/crates/json2arrow). - -## `football.json` - -Football match outcomes across multiple divisions from 2013 to 2017. This dataset is a subset of a larger dataset from https://github.com/openfootball/football.json. The subset was made such that there are records for all five chosen divisions over the time period. 
- -## `gapminder.json` -### Source -- **Original Data**: [Gapminder Foundation](https://www.gapminder.org/) -- **URLs**: - - Life Expectancy (v14): [Data](https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676) | [Reference](https://www.gapminder.org/data/documentation/gd004/) - - Population (v7): [Data](https://docs.google.com/spreadsheets/d/1c1luQNdpH90tNbMIeU7jD__59wQ0bdIGRFpbMm8ZBTk/edit?gid=176703676#gid=176703676) | [Reference](https://www.gapminder.org/data/documentation/gd003/) - - Fertility (v14): [Data](https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676) | [Reference](https://www.gapminder.org/data/documentation/gd008/) - - Data Geographies (v2): [Data](https://docs.google.com/spreadsheets/d/1qHalit8sXC0R8oVXibc2wa2gY7bkwGzOybEMTWp-08o/edit?gid=1597424158#gid=1597424158) | [Reference](https://www.gapminder.org/data/geo/) - -- **Date Accessed**: July 11, 2024 -- **License**: Creative Commons Attribution 4.0 International (CC BY 4.0) | [Reference](https://www.gapminder.org/free-material/) - -### Description -This dataset combines key demographic indicators (life expectancy at birth, population, and fertility rate measured as babies per woman) for various countries from 1955 to 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable grouping countries. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) notes that its philosophy is to fill data gaps with estimates and use current geographic boundaries for historical data. Gapminder states that it aims to "show people the big picture" rather than support detailed numeric analysis. - -#### Columns: -1. `year` (type: integer): Years from 1955 to 2005 at 5-year intervals -2. `country` (type: string): Name of the country -3. `cluster` (type: integer): A categorical variable (values 0-5) grouping countries. 
See Revision Notes for details. -4. `pop` (type: integer): Population of the country -5. `life_expect` (type: float): Life expectancy in years -6. `fertility` (type: float): Fertility rate (average number of children per woman) - -### Revision Notes -1. Country Selection: The set of countries in this file matches the version of this dataset originally added to this collection in 2015. The specific criteria for country selection in that version are not known. Data for Aruba are no longer available in the new version. Hong Kong has been revised to Hong Kong, China in the new version. -2. Data Precision: The precision of float values may have changed from the original version. These changes reflect the most recent source data used for each indicator. -3. Regional Groupings: The 'cluster' column represents a regional mapping of countries corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. To preserve continuity with previous versions of this dataset, we have retained the column name 'cluster' instead of renaming it to 'six_regions'. The six regions represented are: -`0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`. - -## `gapminder-health-income.csv` -**Original Data**: [Gapminder Foundation](https://www.gapminder.org/) - -**Description** Per-capita income, life expectancy, population and regional grouping. Dataset does not specify the reference year for the data. Gapminder historical data is subject to revisions. - -Gapminder (v30, 2023) defines per-capita income as follows: - ->"This is real GDP per capita (gross domestic product per person adjusted for inflation) converted to international dollars using purchasing power parity rates. An international dollar has the same purchasing power over GDP as the U.S. dollar has in the United States." 
| [Source](https://docs.google.com/spreadsheets/d/1i5AEui3WZNZqh7MQ4AKkJuCz4rRxGR_pw_9gtbcBOqQ/edit?gid=501532268#gid=501532268) - -**License**: Creative Commons Attribution 4.0 International (CC BY 4.0) | [Reference](https://www.gapminder.org/free-material/) - -## `github.csv` - -Generated using `/scripts/github.py`. - -## `global-temp.csv` - -Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, L-OTI), 1880-2023. Source: NASA's Goddard Institute for Space Studies https://data.giss.nasa.gov/gistemp/ - -## `income.json` - -## `iowa-electricity.csv` - -The state of Iowa has dramatically increased its production of renewable wind power in recent years. This file contains the annual net generation of electricity in the state by source in thousand megawatthours. The dataset was compiled by the [U.S. Energy Information Administration](https://www.eia.gov/beta/electricity/data/browser/#/topic/0?agg=2,0,1&fuel=vvg&geo=00000g&sec=g&linechart=ELEC.GEN.OTH-IA-99.A~ELEC.GEN.COW-IA-99.A~ELEC.GEN.PEL-IA-99.A~ELEC.GEN.PC-IA-99.A~ELEC.GEN.NG-IA-99.A~~ELEC.GEN.NUC-IA-99.A~ELEC.GEN.HYC-IA-99.A~ELEC.GEN.AOR-IA-99.A~ELEC.GEN.HPS-IA-99.A~&columnchart=ELEC.GEN.ALL-IA-99.A&map=ELEC.GEN.ALL-IA-99.A&freq=A&start=2001&end=2017&ctype=linechart&ltype=pin&tab=overview&maptype=0&rse=0&pin=) and downloaded on May 6, 2018. It is useful for illustrating stacked area charts. - -## `jobs.json` -U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which "collects, preserves and harmonizes U.S. census microdata" from as early as 1790. - -Originally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Viégas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf).
The dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/). - -### Notes on Data Origin -Data is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of [IPUMS USA](https://usa.ipums.org/usa/), according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions). - -### Data Structure -The dataset is structured as follows: -- job: The occupation title -- sex: Sex (men/women) -- year: Census year -- count: Number of individuals in the occupation -- perc: Percentage of the workforce in the occupation - -### Redistribution -IPUMS USA confirmed in 2024 correspondence that hosting this dataset on vega-datasets is permissible, stating: - ->We're excited to hear that this dataset made its way to this repository and is being used by students for data visualization. We allow for these types of redistributions of summary data so long as the underlying microdata records are not shared. - -This dataset contains only summary statistics and does not include any underlying microdata records. - -### Usage Notes -1. This dataset represents summary data. The underlying microdata records are not included. -2. Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section) (person weight) variable as an expansion factor when working with IPUMS USA extracts. -3. Due to coding revisions, figures for earlier years (particularly 19th century) may not match current IPUMS USA data exactly. 
- -### Terms of Use and Citation -When using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml). The organization requests use of the following citation for this json file: - -Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0 - - -## `la-riots.csv` - -More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles for five days starting on April 29, 1992. This file contains metadata about each person, including the geographic coordinates of their death. It was compiled and published by the [Los Angeles Times Data Desk](http://spreadsheets.latimes.com/la-riots-deaths/). - -## `londonBoroughs.json` - -Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile held at https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london. Original data "contains National Statistics data © Crown copyright and database right (2015)" and "Contains Ordnance Survey data © Crown copyright and database right [2015]". - -## `londonCentroids.json` - -Calculated from `londonBoroughs.json` using `d3.geoCentroid`. - -## `londonTubeLines.json` - -Selected rail lines simplified from `tfl_lines.json` at https://github.com/oobrien/vis/tree/master/tube/data - -## `lookup_groups.csv`, `lookup_people.csv` - -## `miserables.json` - -## `monarchs.json` - -A chronological list of English and British monarchs from Elizabeth I through George IV. -### Data Structure -Each entry includes: - -- `name`: The ruler's name or identifier (e.g., "W&M" for William and Mary, "Cromwell" for the period of interregnum) -- `start`: The year their rule began.
-- `end`: The year their rule ended -- `index`: A [zero-based sequential number](https://en.wikipedia.org/wiki/Zero-based_numbering) assigned to each entry, representing the chronological order of rulers -- `commonwealth`: A Boolean flag (true) for the period from 1649 to 1660. This field is omitted for all other entries. - -### Known Inaccuracies and Special Notes - -#### Start and end dates -The dataset contains two intentional inaccuracies to maintain compatibility with the [Wheat and Wages](https://vega.github.io/vega/examples/wheat-and-wages/) example visualization: -1. the start date for the reign of Elizabeth I is shown as 1565, instead of 1558; -2. the end date for the reign of George IV is shown as 1820, instead of 1830. - -These discrepancies align the `monarchs.json` dataset with the start and end dates of the `wheat.json` dataset used in the visualization. - -#### William & Mary's Reign -The entry "W&M" represents the joint reign of William III and Mary II. While the dataset shows their reign as 1689-1702, the official Web site of the British royal family indicates that Mary II's reign ended in 1694, though William III continued to rule until 1702. - -#### Interregnum Period -The `commonwealth` field is used to flag the period from 1649 to 1660, which includes the Commonwealth of England, the Protectorate, and the period leading to the Restoration. While historically more accurate to call this the "interregnum," the field name of `commonwealth` from the original dataset is retained for backwards compatibility. - -#### Recent updates - -The dataset was revised in Aug. 2024. James II's reign now ends in 1688 (previously 1689).
- - ### Data Source and Licensing - Source data has been verified against the [kings & queens](https://www.royal.uk/kings-and-queens-1066 -) and [interregnum](https://www.royal.uk/interregnum-1649-1660 -) [official website of the British royal family](https://www.royal.uk) pages of the official Web site of the British royal family (retrieved in Aug. 2024). Content on the site is protected by Crown Copyright. Under the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most Crown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/). - -## `movies.json` - -The dataset has well known and intentionally included errors. This dataset is used for instructional purposes, including the need to reckon with dirty data. - -## `normal-2d.json` - -## `obesity.json` - -## `ohlc.json` - -This dataset contains the performance of the Chicago Board Options Exchange [Volatility Index](https://en.wikipedia.org/wiki/VIX) 
([VIX](https://finance.yahoo.com/chart/%5EVIX?ltr=1#eyJpbnRlcnZhbCI6ImRheSIsInBlcmlvZGljaXR5IjoxLCJ0aW1lVW5pdCI6bnVsbCwiY2FuZGxlV2lkdGgiOjgsInZvbHVtZVVuZGVybGF5Ijp0cnVlLCJhZGoiOnRydWUsImNyb3NzaGFpciI6dHJ1ZSwiY2hhcnRUeXBlIjoibGluZSIsImV4dGVuZGVkIjpmYWxzZSwibWFya2V0U2Vzc2lvbnMiOnt9LCJhZ2dyZWdhdGlvblR5cGUiOiJvaGxjIiwiY2hhcnRTY2FsZSI6ImxpbmVhciIsInN0dWRpZXMiOnsidm9sIHVuZHIiOnsidHlwZSI6InZvbCB1bmRyIiwiaW5wdXRzIjp7ImlkIjoidm9sIHVuZHIiLCJkaXNwbGF5Ijoidm9sIHVuZHIifSwib3V0cHV0cyI6eyJVcCBWb2x1bWUiOiIjMDBiMDYxIiwiRG93biBWb2x1bWUiOiIjRkYzMzNBIn0sInBhbmVsIjoiY2hhcnQiLCJwYXJhbWV0ZXJzIjp7IndpZHRoRmFjdG9yIjowLjQ1LCJjaGFydE5hbWUiOiJjaGFydCJ9fX0sInBhbmVscyI6eyJjaGFydCI6eyJwZXJjZW50IjoxLCJkaXNwbGF5IjoiXlZJWCIsImNoYXJ0TmFtZSI6ImNoYXJ0IiwidG9wIjowfX0sInNldFNwYW4iOnt9LCJsaW5lV2lkdGgiOjIsInN0cmlwZWRCYWNrZ3JvdWQiOnRydWUsImV2ZW50cyI6dHJ1ZSwiY29sb3IiOiIjMDA4MWYyIiwiZXZlbnRNYXAiOnsiY29ycG9yYXRlIjp7ImRpdnMiOnRydWUsInNwbGl0cyI6dHJ1ZX0sInNpZ0RldiI6e319LCJzeW1ib2xzIjpbeyJzeW1ib2wiOiJeVklYIiwic3ltYm9sT2JqZWN0Ijp7InN5bWJvbCI6Il5WSVgifSwicGVyaW9kaWNpdHkiOjEsImludGVydmFsIjoiZGF5IiwidGltZVVuaXQiOm51bGwsInNldFNwYW4iOnt9fV19)) in the summer of 2009. - -## `penguins.json` - -Palmer Archipelago (Antarctica) penguin data collected and made available by [Dr. Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) and the [Palmer Station, Antarctica LTER](https://pal.lternet.edu/), a member of the [Long Term Ecological Research Network](https://lternet.edu/). For more information visit [allisonhorst/penguins](https://github.com/allisonhorst/penguins) on GitHub. - -## `platformer-terrain.json` - -Assets from the video game [Celeste](http://www.celestegame.com/). - -## `points.json` - -## `political-contributions.json` - -Summary financial information on contributions to candidates for U.S. elections. 
An updated version of this dataset is available from the "all candidates" files (in pipe-delimited format) on the [bulk data download](https://www.fec.gov/data/browse-data/?tab=bulk-data) page of the U.S. Federal Election Commission, or, alternatively, via [OpenFEC](https://api.open.fec.gov/developers/). Information on each of the 25 columns is available from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/). The sample dataset in `political-contributions.json` contains 58 records with dates from 2015. - -### Terms of Use - -FEC data is subject to the commission's: -- [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/) -- [Privacy and Security Policy](https://www.fec.gov/about/privacy-and-security-policy/) -- [Acceptable Use Policy](https://github.com/fecgov/FEC/blob/master/ACCEPTABLE-USE-POLICY.md) - -Additionally, the FEC's GitHub [repository](https://github.com/fecgov/FEC) states: - -> This project is in the public domain within the United States, and we waive worldwide copyright and related rights through [CC0 universal public domain](https://creativecommons.org/publicdomain/zero/1.0/) dedication. Read more on our license page. A few restrictions limit the way you can use FEC data. For example, you can't use contributor lists for commercial purposes or to solicit donations. Learn more on [FEC.gov](https://www.fec.gov/). - -## `population.json` -United States population statistics by sex and age group across decades between 1850 and 2000. The dataset was obtained from [IPUMS USA](https://usa.ipums.org/usa/), which "collects, preserves and harmonizes U.S. census microdata" from as early as 1790. - -### Data Structure -The dataset is structured as follows: -- year: four-digit year of the survey.
- [IPUMS description](https://usa.ipums.org/usa-action/variables/YEAR#description_section) -- age: age group in 5-year intervals (0 represents ages 0-4, 5 represents 5-9, 10 represents 10-14, etc., up to 90 representing 90 and above) - [IPUMS description](https://usa.ipums.org/usa-action/variables/AGE#description_section) -- sex: Sex (men = 1 / women = 2) - [IPUMS description](https://usa.ipums.org/usa-action/variables/SEX#description_section) -- people: Number of individuals, equivalent to IPUMS variable name [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section). - -### Notes on Data Origin -IPUMS updates and revises datasets over time, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions). - -### Terms of Use and Citation -When using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml). The organization requests the use of the following citation for this json file: - -Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0 - -## `population_engineers_hurricanes.csv` - -Data about engineers from https://www.bls.gov/oes/tables.htm. Hurricane data from http://www.nhc.noaa.gov/paststate.shtml. Income data from https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_07_3YR_S1901&prodType=table. - -## `seattle-weather.csv` - -Data from [NOAA](https://www.ncdc.noaa.gov/cdo-web/datatools/records). Daily weather records with metric units. Transformed using `/scripts/weather.py`. We synthesized the categorical "weather" field from multiple fields in the original dataset. This data is intended for instructional purposes. 
- -## `seattle-weather-hourly-normals.csv` - -Data from [NOAA](https://www.ncdc.noaa.gov/cdo-web/datatools/normals). Hourly weather normals with metric units. The 1981-2010 Climate Normals are NCDC's three-decade averages of climatological variables, including temperature and precipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf). We only included temperature, wind, and pressure and updated the format to be easier to parse. - -## `sp500.csv` - -## `sp500-2000.csv` - -S&P 500 index values from 2000 to 2020, retrieved from [Yahoo Finance](https://finance.yahoo.com/quote/%5EDJI/history/). - -## `stocks.csv` - -## `udistrict.json` - -## `unemployment-across-industries.json` - -Industry-level unemployment statistics from the [Current Population Survey](https://www.census.gov/programs-surveys/cps.html) (CPS), published monthly by the U.S. Bureau of Labor Statistics. Includes unemployed persons and unemployment rate across 11 private industries, as well as agricultural, government, and self-employed workers. Covers January 2000 through February 2010. Industry classification follows format of CPS [Table A-31](https://www.bls.gov/web/empsit/cpseea31.htm). - -### Data Structure -Each entry in the JSON file contains: -- `series`: Industry name -- `year`: Year (2000-2010) -- `month`: Month (1-12) -- `count`: Number of unemployed persons (in thousands) -- `rate`: Unemployment rate (percentage) -- `date`: [ISO 8601](https://www.iso.org/iso-8601-date-and-time-format.html)-formatted date string (e.g., "2000-01-01T08:00:00.000Z") - -The dataset can be replicated using the BLS API. For more, see the `scripts` folder of this repository. - -### Citing Data -The BLS Web site states: -> "Users of the public API should cite the date that data were accessed or retrieved using the API. 
Users must clearly state that “BLS.gov cannot vouch for the data or analyses derived from these data after the data have been retrieved from BLS.gov.” The BLS.gov logo may not be used by persons who are not BLS employees or on products (including web pages) that are not BLS-sponsored." - -See full BLS [terms of service](https://www.bls.gov/developers/termsOfService.htm). - -## `unemployment.tsv` - -This dataset contains county-level unemployment rates in the United States, with data generally consistent with levels reported in 2009. The dataset is structured as tab-separated values with two columns: - -1. `id`: The combined [state and county FIPS code](https://www.census.gov/library/reference/code-lists/ansi.html) -2. `rate`: The unemployment rate for the county - -The unemployment rate represents the number of unemployed persons as a percentage of the labor force. According to the [Bureau of Labor Statistics (BLS) glossary](https://www.bls.gov/opub/hom/glossary.htm#U): - -> Unemployed persons (Current Population Survey) [are] persons aged 16 years and older who had no employment during the reference week, were available for work, except for temporary illness, and had made specific efforts to find employment sometime during the 4-week period ending with the reference week. Persons who were waiting to be recalled to a job from which they had been laid off need not have been looking for work to be classified as unemployed. - -The labor force includes all persons classified as employed or unemployed in accordance with the BLS definitions. - -### Data Source - -This dataset is derived from the [Local Area Unemployment Statistics (LAUS)](https://www.bls.gov/lau/) program, a federal-state cooperative effort overseen by the Bureau of Labor Statistics (BLS). The LAUS program produces monthly and annual employment, unemployment, and labor force data for census regions and divisions, states, counties, metropolitan areas, and many cities and towns. 
- -### Accessing Current LAUS Data - -For the most up-to-date LAUS data: - -1. **Monthly and Annual Data Downloads**: - - Visit the [LAUS Data Tools](https://www.bls.gov/lau/data.htm) page for [monthly](https://www.bls.gov/lau/tables.htm#mcounty) and [annual](https://www.bls.gov/lau/tables.htm#cntyaa) county data. - -2. **BLS Public Data API**: - - The BLS provides an [API for developers](https://www.bls.gov/developers/) to access various datasets, including LAUS data. - - To use the API for LAUS data, refer to the [LAUS Series ID Formats](https://www.bls.gov/help/hlpforma.htm#LA) to construct your query. - - API documentation and examples are available on the [BLS Developers](https://www.bls.gov/developers/) page. - -### Terms of Use - -When using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm), which includes the following guidelines: - -1. Cite the date that data were accessed or retrieved. -2. Acknowledge that "BLS.gov cannot vouch for the data or analyses derived from these data after the data have been retrieved from BLS.gov." -3. Do not use the BLS logo without permission. - -For detailed methodology and technical information about LAUS estimates, refer to the [BLS Handbook of Methods](https://www.bls.gov/opub/hom/lau/home.htm). - -## `uniform-2d.json` - -## `us-10m.json` - -## `us-employment.csv` - -In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job losses across the United States. The downturn in employment, and the slow recovery in hiring that followed, was tracked each month by the [Current Employment Statistics](https://www.bls.gov/ces/) program at the U.S. Bureau of Labor Statistics. - -This file contains the monthly employment total in a variety of job categories from January 2006 through December 2015. The numbers are seasonally adjusted and reported in thousands. The data were downloaded on Nov. 
11, 2018, and reformatted for use in this library. - -Totals are included for the [22 "supersectors"](https://download.bls.gov/pub/time.series/ce/ce.supersector) tracked by the BLS. The "nonfarm" total is the category typically used by economists and journalists as a stand-in for the country's employment total. - -A calculated "nonfarm_change" column has been appended with the month-to-month change in that supersector's employment. It is useful for illustrating how to make bar charts that report both negative and positive values. - -## `us-state-capitals.json` - -## `volcano.json` - -Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. This data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a topographic map by Ross Ihaka, adapted from [R datasets](https://stat.ethz.ch/R-manual/R-patched/library/datasets/html/volcano.html). These data should not be regarded as accurate. - -## `weather.json` - -Instructional dataset showing actual and predicted temperature data. - -## `weather.csv` - -Data from [NOAA](http://www.ncdc.noaa.gov/cdo-web/datatools/findstation). Transformed using `/scripts/weather.py`. We synthesized the categorical "weather" field from multiple fields in the original dataset. This data is intended for instructional purposes. - -## `wheat.json` - -In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair), a Scottish engineer who is often credited as the founder of statistical graphics, published [an elegant chart on the price of wheat](http://dh101.humanities.ucla.edu/wp-content/uploads/2014/08/Vis_2.jpg). It plots 250 years of prices alongside weekly wages and the reigning monarch. He intended to demonstrate that “never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time.” - -## `windvectors.csv` - -Simulated wind patterns over northwestern Europe. 
- -## `world-110m.json` - -## `zipcodes.csv` - -GeoNames.org From e6caa0ae17898f7c45be672955f8697f70c5fc34 Mon Sep 17 00:00:00 2001 From: ds <63077097+dsmedia@users.noreply.github.com> Date: Thu, 12 Dec 2024 18:52:53 -0500 Subject: [PATCH 14/40] Move and rename SOURCES.toml to _data/datapackage_additions.toml --- SOURCES.toml => _data/datapackage_additions.toml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename SOURCES.toml => _data/datapackage_additions.toml (100%) diff --git a/SOURCES.toml b/_data/datapackage_additions.toml similarity index 100% rename from SOURCES.toml rename to _data/datapackage_additions.toml From 258ccc88141c824a6519739064af094b8fb1885b Mon Sep 17 00:00:00 2001 From: ds <63077097+dsmedia@users.noreply.github.com> Date: Thu, 12 Dec 2024 19:06:55 -0500 Subject: [PATCH 15/40] update filepath of datapackage_additions.toml in build_datapackage.py file had previously been called SOURCES.toml in project dir, now renamed to datapackage_additions.toml in _data subfolder --- scripts/build_datapackage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/build_datapackage.py b/scripts/build_datapackage.py index de2dae98..aa557518 100755 --- a/scripts/build_datapackage.py +++ b/scripts/build_datapackage.py @@ -397,7 +397,7 @@ def main( raise TypeError(msg) repo_dir: Path = Path(__file__).parent.parent data_dir: Path = repo_dir / "data" - sources_toml: Path = repo_dir / "SOURCES.toml" + sources_toml: Path = repo_dir / "_data" / "datapackage_additions.toml" sources = read_toml(sources_toml) # NOTE: Package metadata is expected to be stored in `sources` in the future From dbc1e8d1dee2ef98fd3190087eb333abdc3a29b0 Mon Sep 17 00:00:00 2001 From: Daniel Sorid <63077097+dsmedia@users.noreply.github.com> Date: Fri, 13 Dec 2024 04:40:26 +0000 Subject: [PATCH 16/40] regenerate datapackage.json following change to _data/datapackage_additions.toml --- datapackage.json | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) 
diff --git a/datapackage.json b/datapackage.json index 6379224c..265a8e50 100644 --- a/datapackage.json +++ b/datapackage.json @@ -21,7 +21,7 @@ } ], "version": "2.11.0", - "created": "2024-12-11T16:26:59.665471+00:00", + "created": "2024-12-13T04:34:18.194166+00:00", "resources": [ { "name": "7zip.png", @@ -588,7 +588,7 @@ { "name": "burtin.json", "type": "table", - "description": "The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine.\n\nThe dataset compares the performance of three antibiotics against 16 different bacteria.\n\nNumerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness.\n\nThe dataset was featured as an example in the Protovis project, a precursor to D3.js.\n\nAs noted in the Protovis example, \"Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 μg/ml, and an exaggeration of some values for penicillin\".\n\nThe vega-datsets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together.\n\nThe caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) reads as follows:\n\n> ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin\n>\n>\n> The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in > red and gram- in blue) with their sensitivities to penicillin, and streptomycin.\n>\n> The effectiveness of the antibiotics is expressed as the highest dilution in μ/ml. 
which inhibits > the test organism.\n>\n> High dilutions are toward the periphery; consequently the length of the colored bar is proportional > to the effectiveness.\n>\n> It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. > fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. > vulgaris, S. schottmuelleri and M. tuberculosis.\n>\n> Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to > neomycin, although the majority of these are sensitive to neomycin.\n>\n> It also inhibits actinomycetes, but is inactive against viruses and fungi. Its mode of action is > not understood.\n", + "description": "The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine.\n\nThe dataset compares the performance of three antibiotics against 16 different bacteria.\n\nNumerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness.\n\nThe dataset was featured as an example in the Protovis project, a precursor to D3.js.\n\nAs noted in the Protovis example, \"Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 μg/ml, and an exaggeration of some values for penicillin\".\n\nThe vega-datsets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together.\n\nThe caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) \nreads as follows:\n\n> ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin\n>\n>\n> The chart compares the in vitro sensitivities to 
neomycin of some of the common pathogens (gram+ in > red and gram- in blue) with their sensitivities to penicillin, and streptomycin.\n>\n> The effectiveness of the antibiotics is expressed as the highest dilution in μ/ml. which inhibits > the test organism.\n>\n> High dilutions are toward the periphery; consequently the length of the colored bar is proportional > to the effectiveness.\n>\n> It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. > fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. > vulgaris, S. schottmuelleri and M. tuberculosis.\n>\n> Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to > neomycin, although the majority of these are sensitive to neomycin.\n>\n> It also inhibits actinomycetes, but is inactive against viruses and fungi. Its mode of action is > not understood.\n", "sources": [ { "title": "Scope Magazine", @@ -734,7 +734,7 @@ { "name": "countries.json", "type": "table", - "description": "This dataset combines key demographic indicators (life expectancy at birth and\nfertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year\nintervals. It includes both current values and adjacent time period values (previous and next)\nfor each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/\ndocumentation/) notes that its philosophy is to fill data gaps with estimates and use current\ngeographic boundaries for historical data. Gapminder states that it aims to \"show people the\nbig picture\" rather than support detailed numeric analysis.", + "description": "This dataset combines key demographic indicators (life expectancy at birth and\nfertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year\nintervals. It includes both current values and adjacent time period values (previous and next)\nfor each indicator. 
Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) \nnotes that its philosophy is to fill data gaps with estimates and use current\ngeographic boundaries for historical data. Gapminder states that it aims to \"show people the\nbig picture\" rather than support detailed numeric analysis.", "licenses": [ { "title": "Creative Commons Attribution 4.0 International", @@ -1407,7 +1407,7 @@ { "name": "gapminder.json", "type": "table", - "description": "This dataset combines key demographic indicators (life expectancy at birth, \npopulation, and fertility rate measured as babies per woman) for various countries from 1955 \nto 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable \ngrouping countries. Gapminder's data documentation notes that its philosophy is to fill data \ngaps with estimates and use current geographic boundaries for historical data. Gapminder \nstates that it aims to \"show people the big picture\" rather than support detailed numeric \nanalysis.\n\nNotes:\n1. Country Selection: The set of countries in this file matches the version of this dataset \n originally added to this collection in 2015. The specific criteria for country selection \n in that version are not known. Data for Aruba are no longer available in the new version. \n Hong Kong has been revised to Hong Kong, China in the new version.\n\n2. Data Precision: The precision of float values may have changed from the original version. \n These changes reflect the most recent source data used for each indicator.\n\n3. Regional Groupings: The 'cluster' column represents a regional mapping of countries \n corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. To \n preserve continuity with previous versions of this dataset, we have retained the column \n name 'cluster' instead of renaming it to 'six_regions'. 
The six regions represented are: \n `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, \n 4: east_asia_pacific, 5: middle_east_north_africa`.", + "description": "This dataset combines key demographic indicators (life expectancy at birth, \npopulation, and fertility rate measured as babies per woman) for various countries from 1955 \nto 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable \ngrouping countries. Gapminder's data documentation notes that its philosophy is to fill data \ngaps with estimates and use current geographic boundaries for historical data. Gapminder \nstates that it aims to \"show people the big picture\" rather than support detailed numeric \nanalysis.\n\nNotes:\n1. Country Selection: The set of countries in this file matches the version of this dataset \n originally added to this collection in 2015. The specific criteria for country selection \n in that version are not known. Data for Aruba are no longer available in the new version. \n Hong Kong has been revised to Hong Kong, China in the new version.\n\n2. Data Precision: The precision of float values may have changed from the original version. \n These changes reflect the most recent source data used for each indicator.\n\n3. Regional Groupings: The 'cluster' column represents a regional mapping of countries \n corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. To \n preserve continuity with previous versions of this dataset, we have retained the column \n name 'cluster' instead of renaming it to 'six_regions'. The six regions represented are: \n `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`.", "sources": [ { "title": "Gapminder Foundation - Life Expectancy (Data)", @@ -1632,7 +1632,7 @@ { "name": "jobs.json", "type": "table", - "description": "U.S. 
census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census microdata\" from as early as 1790.\n\nOriginally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Viégas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). The dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/).\n\nData is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of IPUMS USA, according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions).\n\nIPUMS USA confirmed in 2024 correspondence that hosting this dataset on vega-datasets is permissible, stating:\n>We're excited to hear that this dataset made its way to this repository and is being used by students for data visualization. We allow for these types of redistributions of summary data so long as the underlying microdata records are not shared.\n\nThis dataset contains only summary statistics and does not include any underlying microdata records.\n\n1. This dataset represents summary data. The underlying microdata records are not included.\n2. Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section) (person weight) variable as an expansion factor when working with IPUMS USA extracts.\n3. 
Due to coding revisions, figures for earlier years (particularly 19th century) may not match current IPUMS USA data exactly.\n\nWhen using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml). The organization requests use of the following citation for this json file:\n\nSteven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0\n", + "description": "U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which \"collects, preserves and harmonizes U.S. census microdata\" from as early as 1790.\n\nOriginally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Viégas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). \nThe dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/).\n\nData is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of IPUMS USA, according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions).\n\nIPUMS USA confirmed in 2024 correspondence that hosting this dataset on vega-datasets is permissible, stating:\n>We're excited to hear that this dataset made its way to this repository and is being used by students for data visualization. 
We allow for these types of redistributions of summary data so long as the underlying microdata records are not shared.\n\nThis dataset contains only summary statistics and does not include any underlying microdata records.\n\n1. This dataset represents summary data. The underlying microdata records are not included.\n2. Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section) \n(person weight) variable as an expansion factor when working with IPUMS USA extracts.\n3. Due to coding revisions, figures for earlier years (particularly 19th century) may not match current IPUMS USA data exactly.\n\nWhen using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml).\nThe organization requests use of the following citation for this json file:\n\nSteven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0\n", "sources": [ { "title": "IPUMS USA", @@ -2246,7 +2246,7 @@ { "name": "political-contributions.json", "type": "table", - "description": "Summary financial information on contributions to candidates for U.S. \nelections. An updated version of this datset is available from the \"all candidates\" files \n(in pipe-delimited format) on the bulk data download page of the U.S. Federal Election \nCommission, or, alternatively, via OpenFEC. Information on each of the 25 columns is \navailable from the [FEC All Candidates File Description](https://www.fec.gov/\ncampaign-finance-data/all-candidates-file-description/). 
The sample dataset in \n`political-contributions.json` contains 58 records with dates from 2015.\n\nFEC data is subject to the commission's:\n- [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/)\n- [Privacy and Security Policy](https://www.fec.gov/about/privacy-and-security-policy/)\n- [Acceptable Use Policy](https://github.com/fecgov/FEC/blob/master/ACCEPTABLE-USE-POLICY.md)\n\nAdditionally, the FEC's Github [repository](https://github.com/fecgov/FEC) states:\n> This project is in the public domain within the United States, and we waive worldwide \n> copyright and related rights through [CC0 universal public domain](https://\n> creativecommons.org/publicdomain/zero/1.0/) dedication. Read more on our license page. \n> A few restrictions limit the way you can use FEC data. For example, you can't use \n> contributor lists for commercial purposes or to solicit donations. Learn more on \n> [FEC.gov](https://www.fec.gov/).", + "description": "Summary financial information on contributions to candidates for U.S. \nelections. An updated version of this datset is available from the \"all candidates\" files \n(in pipe-delimited format) on the bulk data download page of the U.S. Federal Election \nCommission, or, alternatively, via OpenFEC. 
Information on each of the 25 columns is \navailable from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/).\nThe sample dataset in `political-contributions.json` contains 58 records with dates from 2015.\n\nFEC data is subject to the commission's:\n- [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/)\n- [Privacy and Security Policy](https://www.fec.gov/about/privacy-and-security-policy/)\n- [Acceptable Use Policy](https://github.com/fecgov/FEC/blob/master/ACCEPTABLE-USE-POLICY.md)\n\nAdditionally, the FEC's Github [repository](https://github.com/fecgov/FEC) states:\n> This project is in the public domain within the United States, and we waive worldwide \n> copyright and related rights through [CC0 universal public domain](https://creativecommons.org/publicdomain/zero/1.0/)\n> dedication. Read more on our license page.\n> A few restrictions limit the way you can use FEC data. For example, you can't use \n> contributor lists for commercial purposes or to solicit donations. Learn more on \n> [FEC.gov](https://www.fec.gov/).", "sources": [ { "title": "Federal Election Commission Bulk Data", @@ -2468,7 +2468,7 @@ { "name": "seattle-weather-hourly-normals.csv", "type": "table", - "description": "Hourly weather normals with metric units. The 1981-2010 Climate Normals are \nNCDC's three-decade averages of climatological variables, including temperature and \nprecipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/\ndocumentation/NORMAL_HLY_documentation.pdf). We only included temperature, wind, and pressure \nand updated the format to be easier to parse.", + "description": "Hourly weather normals with metric units. The 1981-2010 Climate Normals are \nNCDC's three-decade averages of climatological variables, including temperature and \nprecipitation. 
Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf).\nWe only included temperature, wind, and pressure \nand updated the format to be easier to parse.", "sources": [ { "title": "NOAA National Climatic Data Center (NCDC)", @@ -2805,7 +2805,7 @@ { "name": "us-employment.csv", "type": "table", - "description": "In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job \nlosses across the United States. The downturn in employment, and the slow recovery in hiring that \nfollowed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau \nof Labor Statistics.\n\nThis file contains the monthly employment total in a variety of job categories from January 2006 \nthrough December 2015. The numbers are seasonally adjusted and reported in thousands. The data \nwere downloaded on Nov. 11, 2018, and reformatted for use in this library.\n\nTotals are included for the [22 \"supersectors\"](https://download.bls.gov/pub/time.series/ce/\nce.supersector) tracked by the BLS. The \"nonfarm\" total is the category typically used by \neconomists and journalists as a stand-in for the country's employment total.\n\nA calculated \"nonfarm_change\" column has been appended with the month-to-month change in that \nsupersector's employment. It is useful for illustrating how to make bar charts that report both \nnegative and positive values.\n", + "description": "In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job \nlosses across the United States. The downturn in employment, and the slow recovery in hiring that \nfollowed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau \nof Labor Statistics.\n\nThis file contains the monthly employment total in a variety of job categories from January 2006 \nthrough December 2015. The numbers are seasonally adjusted and reported in thousands. 
The data \nwere downloaded on Nov. 11, 2018, and reformatted for use in this library.\n\nTotals are included for the [22 \"supersectors\"](https://download.bls.gov/pub/time.series/ce/ce.supersector)\ntracked by the BLS. The \"nonfarm\" total is the category typically used by \neconomists and journalists as a stand-in for the country's employment total.\n\nA calculated \"nonfarm_change\" column has been appended with the month-to-month change in that \nsupersector's employment. It is useful for illustrating how to make bar charts that report both \nnegative and positive values.\n", "sources": [ { "title": "U.S. Bureau of Labor Statistics Current Employment Statistics", @@ -3029,7 +3029,7 @@ { "name": "wheat.json", "type": "table", - "description": "In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/\nwiki/William_Playfair), a Scottish engineer who is often credited as the founder of \nstatistical graphics, published an elegant chart on the price of wheat. It plots 250 years \nof prices alongside weekly wages and the reigning monarch. He intended to demonstrate that \n\"never at any former period was wheat so cheap, in proportion to mechanical labour, as it \nis at the present time.\"", + "description": "In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair),\na Scottish engineer who is often credited as the founder of statistical graphics, \npublished an elegant chart on the price of wheat. It plots 250 years of prices alongside \nweekly wages and the reigning monarch. 
He intended to demonstrate that \"never at any former period \nwas wheat so cheap, in proportion to mechanical labour, as it is at the present time.\"", "sources": [ { "title": "1822 Playfair Chart", From fbd1be573bb3257d5f1e2eb643b1bd97d6acf380 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:11:47 +0000 Subject: [PATCH 17/40] refactor(ruff): Fix warnings Paired with (https://github.com/vega/vega-datasets/pull/645/commits/2b1be705fef15899a51155573ba5c42f4febf2d9) --- scripts/build_datapackage.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/scripts/build_datapackage.py b/scripts/build_datapackage.py index aa557518..9129aff5 100755 --- a/scripts/build_datapackage.py +++ b/scripts/build_datapackage.py @@ -38,6 +38,7 @@ import json import logging import os +import tomllib import warnings from collections.abc import Mapping, Sequence from functools import partial @@ -296,7 +297,8 @@ def frame_to_schema(frame: pl.LazyFrame | pl.DataFrame, /) -> fl.Schema: def extract_package_metadata(repo_root: Path, /) -> PackageMeta: - """Repurpose `package.json`_ for the `Data Package`_ standard. + """ + Repurpose `package.json`_ for the `Data Package`_ standard. .. 
_package.json: https://github.com/vega/vega-datasets/blob/main/package.json @@ -332,8 +334,7 @@ def extract_package_metadata(repo_root: Path, /) -> PackageMeta: def extract_overrides(mapping: Mapping[str, Any], /) -> dict[str, ResourceMeta]: if (resources := mapping.get("resources")) and isinstance(resources, Sequence): return dict(iter_parse_resources(resources)) - else: - raise TypeError(resources) + raise TypeError(resources) def iter_parse_resources( @@ -382,8 +383,6 @@ def iter_resources( def read_toml(fp: Path, /) -> Mapping[str, Any]: - import tomllib - return tomllib.loads(fp.read_text("utf-8")) @@ -393,7 +392,7 @@ def main( output_format: Literal["json", "yaml", "both"] = "json", ) -> None: if output_format not in {"json", "yaml", "both"}: - msg = f"Expected one of {["json", "yaml", "both"]!r} but got {output_format!r}" + msg = f"Expected one of {['json', 'yaml', 'both']!r} but got {output_format!r}" raise TypeError(msg) repo_dir: Path = Path(__file__).parent.parent data_dir: Path = repo_dir / "data" @@ -406,18 +405,19 @@ def main( # - Ensures ``frictionless`` doesn't insert platform-specific path separator(s) os.chdir(data_dir) pkg_meta = extract_package_metadata(repo_dir) - logger.info( - f"Collecting resources for '{pkg_meta['name']}@{pkg_meta['version']}' ..." - ) + msg = f"Collecting resources for '{pkg_meta['name']}@{pkg_meta['version']}' ..." 
+ logger.info(msg) pkg = Package(resources=list(iter_resources(data_dir, overrides)), **pkg_meta) # type: ignore[arg-type] - logger.info(f"Collected {len(pkg.resources)} resources") + msg = f"Collected {len(pkg.resources)} resources" + logger.info(msg) if output_format in {"json", "both"}: p = (repo_dir / f"{stem}.json").as_posix() - logger.info(f"Writing {p!r}") + logger.info(msg) pkg.to_json(p) if output_format in {"yaml", "both"}: p = (repo_dir / f"{stem}.yaml").as_posix() - logger.info(f"Writing {p!r}") + msg = f"Writing {p!r}" + logger.info(msg) pkg.to_yaml(p) From e442c41f2a2bf49fb5f24d2ee73b0e6495656275 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:19:05 +0000 Subject: [PATCH 18/40] style: run `taplo fmt` Seems to be a bug with `align_entries=true`, when multiline strings occur https://github.com/tamasfe/taplo/issues/595 --- _data/datapackage_additions.toml | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/_data/datapackage_additions.toml b/_data/datapackage_additions.toml index 3f8f3157..4c5f6948 100644 --- a/_data/datapackage_additions.toml +++ b/_data/datapackage_additions.toml @@ -122,7 +122,7 @@ title = "StatLib Datasets Archive" path = "http://lib.stat.cmu.edu/datasets/" [[resources]] # Path: co2-concentration.csv -path = "co2-concentration.csv" +path = "co2-concentration.csv" description = """Scripps CO2 program data ut modified to only include date, CO2, seasonally adjusted CO2. Only includes rows with valid data.""" @@ -131,7 +131,7 @@ title = "Scripps CO2 Program" path = "https://scrippsco2.ucsd.edu/data/atmospheric_co2/primary_mlo_co2_record" [[resources]] # Path: countries.json -path = "countries.json" +path = "countries.json" description = """This dataset combines key demographic indicators (life expectancy at birth and fertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year intervals. 
It includes both current values and adjacent time period values (previous and next) @@ -288,7 +288,7 @@ title = "U.S. Bureau of Transportation Statistics" path = "https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr" [[resources]] # Path: football.json -path = "football.json" +path = "football.json" description = """Football match outcomes across multiple divisions from 2013 to 2017, part of a larger dataset from OpenFootball. The subset was made such that there are records for all five chosen divisions over the time period.""" @@ -428,7 +428,7 @@ path = "https://data.giss.nasa.gov/gistemp/" path = "income.json" [[resources]] # Path: iowa-electricity.csv -path = "iowa-electricity.csv" +path = "iowa-electricity.csv" description = """The state of Iowa has dramatically increased its production of renewable wind power in recent years. This file contains the annual net generation of electricity in the state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. @@ -504,7 +504,7 @@ path = "https://usa.ipums.org/usa/" version = "6.0" [[resources]] # Path: la-riots.csv -path = "la-riots.csv" +path = "la-riots.csv" description = """More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles for five days starting on April 29, 1992. This file contains metadata about each person, including the geographic coordinates of their death. Compiled and published by the Los Angeles Times Data Desk.""" @@ -514,7 +514,7 @@ title = "LA Riots Deaths, Los Angeles Times Data Desk" path = "http://spreadsheets.latimes.com/la-riots-deaths/" [[resources]] # Path: londonBoroughs.json -path = "londonBoroughs.json" +path = "londonBoroughs.json" description = """Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. 
Original data \"contains National Statistics data © Crown copyright and database right (2015)\" and \"Contains Ordnance Survey data © Crown copyright and database right [2015].""" @@ -596,7 +596,7 @@ title = "The Royal Family - Interregnum" path = "https://www.royal.uk/interregnum-1649-1660" [[resources]] # Path: movies.json -path = "movies.json" +path = "movies.json" description = """The dataset has well known and intentionally included errors. This dataset is provided for instructional purposes, including the need to reckon with dirty data.""" @@ -607,7 +607,7 @@ path = "normal-2d.json" path = "obesity.json" [[resources]] # Path: ohlc.json -path = "ohlc.json" +path = "ohlc.json" description = """This dataset contains the performance of the Chicago Board Options Exchange [Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/ %5EVIX#overview)) in the summer of 2009.""" @@ -617,7 +617,7 @@ title = "Yahoo Finance VIX Data" path = "https://finance.yahoo.com/chart/%5EVIX" [[resources]] # Path: penguins.json -path = "penguins.json" +path = "penguins.json" description = """Palmer Archipelago (Antarctica) penguin data collected and made available by [Dr. Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) and the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research @@ -709,7 +709,7 @@ title = "IPUMS USA" path = "https://usa.ipums.org/usa/" [[resources]] # Path: population_engineers_hurricanes.csv -path = "population_engineers_hurricanes.csv" +path = "population_engineers_hurricanes.csv" description = """Per-state data on population, number of engineers, and hurricanes. 
Used in Vega-Lite example, [Three Choropleths Representing Disjoint DAta from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html)""" @@ -724,7 +724,7 @@ title = "NOAA National Climatic Data Center" path = "https://www.ncdc.noaa.gov/cdo-web/datatools/records" [[resources]] # Path: seattle-weather-hourly-normals.csv -path = "seattle-weather-hourly-normals.csv" +path = "seattle-weather-hourly-normals.csv" description = """Hourly weather normals with metric units. The 1981-2010 Climate Normals are NCDC's three-decade averages of climatological variables, including temperature and precipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf). @@ -736,7 +736,7 @@ title = "NOAA National Climatic Data Center (NCDC)" path = "https://www.ncdc.noaa.gov/cdo-web/datatools/normals" [[resources]] # Path: seattle-weather.csv -path = "seattle-weather.csv" +path = "seattle-weather.csv" description = """Daily weather records with metric units. Transformed using `/scripts/weather.py`. The categorical \"weather\" field is synthesized from multiple fields in the original dataset. This data is intended for instructional purposes.""" @@ -898,7 +898,7 @@ path = "https://www.bls.gov/ces/" path = "us-state-capitals.json" [[resources]] # Path: volcano.json -path = "volcano.json" +path = "volcano.json" description = """Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. This data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a topographic map by Ross Ihaka, adapted from R datasets. These data should not be regarded as accurate.""" @@ -908,7 +908,7 @@ title = "R Datasets" path = "https://stat.ethz.ch/R-manual/R-patched/library/datasets/html/volcano.html" [[resources]] # Path: weather.csv -path = "weather.csv" +path = "weather.csv" description = """NOAA data transformed using `/scripts/weather.py`. 
Categorical \"weather\" field synthesized from multiple fields in the original dataset. This data is intended for instructional purposes.""" @@ -921,7 +921,7 @@ path = "weather.json" description = """Instructional dataset showing actual and predicted temperature data.""" [[resources]] # Path: wheat.json -path = "wheat.json" +path = "wheat.json" description = """In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair), a Scottish engineer who is often credited as the founder of statistical graphics, published an elegant chart on the price of wheat. It plots 250 years of prices alongside From 8a00e9108f30a87d147d7e8fac0ede185a5828ff Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 13 Dec 2024 12:20:51 +0000 Subject: [PATCH 19/40] feat: Support multiple sources for package-level metadata - Handles merging of `package.json` and `datapackage_additions.toml` - Moves `[[licenses]]` `[[sources]]` is still outstanding (https://github.com/vega/vega-datasets/pull/643#issuecomment-2540266253) --- _data/datapackage_additions.toml | 5 ++ datapackage.json | 2 +- scripts/build_datapackage.py | 119 ++++++++++++++++++++++++------- 3 files changed, 100 insertions(+), 26 deletions(-) diff --git a/_data/datapackage_additions.toml b/_data/datapackage_additions.toml index 4c5f6948..55269912 100644 --- a/_data/datapackage_additions.toml +++ b/_data/datapackage_additions.toml @@ -1,3 +1,8 @@ +[[licenses]] +name = "BSD-3-Clause" +path = "https://opensource.org/license/bsd-3-clause" +title = "The 3-Clause BSD License" + [[resources]] # Path: 7zip.png path = "7zip.png" description = """Application icons from open-source software projects.""" diff --git a/datapackage.json b/datapackage.json index 265a8e50..f023e8f1 100644 --- a/datapackage.json +++ b/datapackage.json @@ -21,7 +21,7 @@ } ], "version": "2.11.0", - "created": "2024-12-13T04:34:18.194166+00:00", + "created": 
"2024-12-13T12:17:23.415170+00:00", "resources": [ { "name": "7zip.png", diff --git a/scripts/build_datapackage.py b/scripts/build_datapackage.py index 9129aff5..3bcf699a 100755 --- a/scripts/build_datapackage.py +++ b/scripts/build_datapackage.py @@ -34,6 +34,7 @@ from __future__ import annotations +import copy import datetime as dt import json import logging @@ -43,7 +44,16 @@ from collections.abc import Mapping, Sequence from functools import partial from pathlib import Path -from typing import TYPE_CHECKING, Any, NotRequired, Required, TypedDict, Unpack, cast +from typing import ( + TYPE_CHECKING, + Any, + LiteralString, + NotRequired, + Required, + TypedDict, + Unpack, + cast, +) import frictionless as fl import polars as pl @@ -97,6 +107,8 @@ | None ) +ADDITIONS_TOML: LiteralString = "datapackage_additions.toml" +NPM_PACKAGE: Literal["package.json"] = "package.json" POLARS_PY_TO_FL_FIELD: Mapping[PythonDataType, type[fl.Field]] = { int: IntegerField, @@ -283,9 +295,9 @@ class PackageMeta(TypedDict): version: str homepage: str description: str - licenses: Sequence[License] + licenses: NotRequired[Sequence[License]] contributors: Sequence[Contributor] - sources: Sequence[Source] + sources: NotRequired[Sequence[Source]] created: str @@ -296,7 +308,7 @@ def frame_to_schema(frame: pl.LazyFrame | pl.DataFrame, /) -> fl.Schema: ) -def extract_package_metadata(repo_root: Path, /) -> PackageMeta: +def _extract_npm_metadata(m: Mapping[str, Any], /) -> PackageMeta: """ Repurpose `package.json`_ for the `Data Package`_ standard. @@ -305,25 +317,12 @@ def extract_package_metadata(repo_root: Path, /) -> PackageMeta: .. 
_Data Package: https://datapackage.org/standard/data-package/#properties """ - fp: Path = repo_root / "package.json" - with fp.open(encoding="utf-8") as f: - m = json.load(f) - if not isinstance(m, Mapping): - msg = f"Unexpected type returned from {fp!r}\n{type(m).__name__!r}" - raise TypeError(msg) return PackageMeta( name=m["name"], version=m["version"], homepage=m["repository"]["url"], description=m["description"], contributors=[Contributor(title=m["author"]["name"], path=m["author"]["url"])], - licenses=[ - License( - name=m["license"], - path="https://opensource.org/license/bsd-3-clause", - title="The 3-Clause BSD License", - ) - ], sources=[ Source(path="https://github.com/vega/vega-datasets/blob/next/SOURCES.md") ], @@ -331,10 +330,74 @@ def extract_package_metadata(repo_root: Path, /) -> PackageMeta: ) -def extract_overrides(mapping: Mapping[str, Any], /) -> dict[str, ResourceMeta]: - if (resources := mapping.get("resources")) and isinstance(resources, Sequence): +def _merge_package_metadata( + pkg_meta: PackageMeta, additions: Mapping[str, Any], / +) -> PackageMeta: + # defined in frictionless spec + spec_keys = PackageMeta.__optional_keys__.union(PackageMeta.__required_keys__) + + if unknown_keys := set(additions).difference(spec_keys): + msg = ( + f"`additions` contains keys that are out of spec:\n" + f"{sorted(unknown_keys)!r}\n\n" + f"Try updating {PackageMeta.__name__!r} or remove them from {ADDITIONS_TOML!r}" + ) + raise TypeError(msg) + + additions = dict(copy.deepcopy(additions)) + + # relevant from `datapackage_additions.toml` + incoming_keys = spec_keys.intersection(additions) + + # In both `package.json` & `datapackage_additions.toml` + overlapping_keys = incoming_keys.intersection(pkg_meta) + + changes = dict[str, Any](copy.deepcopy(pkg_meta)) + + # Extract and handle colliding content + for k in overlapping_keys: + item = pkg_meta[k] + extra = additions.pop(k) + if type(item) is not type(extra): + msg = ( + f"Mismatched types for overlapping 
key {k!r}:\n" + f"Current : {type(item).__name__!r}, {item!r}\n" + f"Incoming : {type(extra).__name__!r}, {extra!r}" + ) + raise TypeError(msg) + if isinstance(item, str) or not isinstance(item, Sequence | Mapping): + msg = f"Overriding overlapping key {k!r}\nCurrent : {item!r}\nIncoming : {extra!r}" + logger.warning(msg, stacklevel=2) + changes[k] = extra + elif isinstance(item, Sequence): + changes[k] = [*item, extra] + else: + msg = ( + f"Expected only lists of mappings or single values, " + f"but got:{type(item).__name__!r}\n{item!r}\n\n{extra!r}" + ) + raise NotImplementedError(msg) + + # Remaining are in-spec and only in `datapackage_additions.toml` + changes |= additions + return PackageMeta(**changes) + + +def extract_package_metadata( + npm: Mapping[str, Any], sources: Mapping[str, Any], / +) -> PackageMeta: + pkg_meta = _extract_npm_metadata(npm) + return _merge_package_metadata(pkg_meta, sources) + + +def extract_overrides(resources: Any, /) -> dict[str, ResourceMeta]: + if isinstance(resources, Sequence): return dict(iter_parse_resources(resources)) - raise TypeError(resources) + msg = ( + f"Expected `resources` to be an array of tables, but got:" + f"\n{type(resources).__name__!r}\n\n{resources!r}" + ) + raise TypeError(msg) def iter_parse_resources( @@ -382,10 +445,15 @@ def iter_resources( continue -def read_toml(fp: Path, /) -> Mapping[str, Any]: +def read_toml(fp: Path, /) -> dict[str, Any]: return tomllib.loads(fp.read_text("utf-8")) +def read_json(fp: Path, /) -> Any: + with fp.open(encoding="utf-8") as f: + return json.load(f) + + def main( *, stem: str = "datapackage", @@ -396,15 +464,16 @@ def main( raise TypeError(msg) repo_dir: Path = Path(__file__).parent.parent data_dir: Path = repo_dir / "data" - sources_toml: Path = repo_dir / "_data" / "datapackage_additions.toml" + sources_toml: Path = repo_dir / "_data" / ADDITIONS_TOML + npm_json = repo_dir / NPM_PACKAGE + npm_package = read_json(npm_json) sources = read_toml(sources_toml) - # NOTE: 
Package metadata is expected to be stored in `sources` in the future - overrides = extract_overrides(sources) + overrides = extract_overrides(sources.pop("resources")) # NOTE: Forcing base directory here # - Ensures ``frictionless`` doesn't insert platform-specific path separator(s) os.chdir(data_dir) - pkg_meta = extract_package_metadata(repo_dir) + pkg_meta = extract_package_metadata(npm_package, sources) msg = f"Collecting resources for '{pkg_meta['name']}@{pkg_meta['version']}' ..." logger.info(msg) pkg = Package(resources=list(iter_resources(data_dir, overrides)), **pkg_meta) # type: ignore[arg-type] From c0b250abf052e20a024ca1f8f8eb28b5a8b38677 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 13 Dec 2024 12:22:45 +0000 Subject: [PATCH 20/40] fix: Removes package-level `[[sources]]` Resolves (https://github.com/vega/vega-datasets/pull/643#issuecomment-2540266253) --- datapackage.json | 7 +------ scripts/build_datapackage.py | 3 --- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/datapackage.json b/datapackage.json index f023e8f1..b5c20bab 100644 --- a/datapackage.json +++ b/datapackage.json @@ -9,11 +9,6 @@ "title": "The 3-Clause BSD License" } ], - "sources": [ - { - "path": "https://github.com/vega/vega-datasets/blob/next/SOURCES.md" - } - ], "contributors": [ { "title": "UW Interactive Data Lab", @@ -21,7 +16,7 @@ } ], "version": "2.11.0", - "created": "2024-12-13T12:17:23.415170+00:00", + "created": "2024-12-13T12:21:52.733563+00:00", "resources": [ { "name": "7zip.png", diff --git a/scripts/build_datapackage.py b/scripts/build_datapackage.py index 3bcf699a..92a5a5aa 100755 --- a/scripts/build_datapackage.py +++ b/scripts/build_datapackage.py @@ -323,9 +323,6 @@ def _extract_npm_metadata(m: Mapping[str, Any], /) -> PackageMeta: homepage=m["repository"]["url"], description=m["description"], contributors=[Contributor(title=m["author"]["name"], path=m["author"]["url"])], - sources=[ - 
Source(path="https://github.com/vega/vega-datasets/blob/next/SOURCES.md") - ], created=dt.datetime.now(dt.UTC).isoformat(), ) From b57259c279abec4c701d38bdebfd2b04aba32e39 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 13 Dec 2024 12:57:12 +0000 Subject: [PATCH 21/40] feat(DRAFT): Generate markdown variants of `datapackage` I think `datapackage-tabular.md` is a lot easier to read. The default option is `datapackage.md`. We only need one of these, so the other can be removed after a decision https://github.com/vega/vega-datasets/pull/643#issuecomment-2541230138 --- datapackage-tabular.md | 1078 ++++++++++++++++++++++++++ datapackage.json | 2 +- datapackage.md | 1386 ++++++++++++++++++++++++++++++++++ scripts/build_datapackage.py | 35 +- 4 files changed, 2489 insertions(+), 12 deletions(-) create mode 100644 datapackage-tabular.md create mode 100644 datapackage.md diff --git a/datapackage-tabular.md b/datapackage-tabular.md new file mode 100644 index 00000000..15b8901e --- /dev/null +++ b/datapackage-tabular.md @@ -0,0 +1,1078 @@ +# `vega-datasets`- `description` Common repository for example datasets used by Vega related projects. +- `homepage` http://github.com/vega/vega-datasets.git +- `licenses` + - [1] + - `name` BSD-3-Clause + - `path` https://opensource.org/license/bsd-3-clause + - `title` The 3-Clause BSD License +- `contributors` + - [1] + - `title` UW Interactive Data Lab + - `path` http://idl.cs.washington.edu +- `version` 2.11.0 +- `created` 2024-12-13T12:53:03.887410+00:00 +## `7zip.png` + - `description` Application icons from open-source software projects. 
+ - `path` 7zip.png +## `airports.csv` + - `path` airports.csv + - `schema` + + | name | type | +|:----------|:-------| +| iata | string | +| name | string | +| city | string | +| state | string | +| country | string | +| latitude | number | +| longitude | number | +## `annual-precip.json` + - `description` A raster grid of global annual precipitation for the year 2016 at a resolution of 1 degree of lon/lat per cell. + - `path` annual-precip.json +## `anscombe.json` + - `description` Graphs in Statistical Analysis, F. J. Anscombe, The American Statistician. + - `path` anscombe.json + - `schema` + + | name | type | +|:-------|:--------| +| Series | string | +| X | integer | +| Y | number | +## `barley.json` + - `description` The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites. + + It was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper "Statistical Determination of Barley Varietal Adaptation". + + R.A. Fisher popularized its use in the field of statistics when he included it in his book "The Design of Experiments". + + Since then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s. + + - `path` barley.json + - `schema` + + | name | type | +|:--------|:--------| +| yield | number | +| variety | string | +| year | integer | +| site | string | +## `birdstrikes.csv` + - `description` Records of reported wildlife strikes received by the U.S.
FAA + - `path` birdstrikes.csv + - `schema` + + | name | type | +|:--------------------------|:--------| +| Airport Name | string | +| Aircraft Make Model | string | +| Effect Amount of damage | string | +| Flight Date | date | +| Aircraft Airline Operator | string | +| Origin State | string | +| Phase of flight | string | +| Wildlife Size | string | +| Wildlife Species | string | +| Time of day | string | +| Cost Other | integer | +| Cost Repair | integer | +| Cost Total $ | integer | +| Speed IAS in knots | integer | +## `budget.json` + - `description` Historical and forecasted federal revenue/receipts produced in 2016 by the U.S. Office of Management and Budget. + - `path` budget.json + - `schema` + + | name | type | +|:------------------------|:--------| +| Source Category Code | integer | +| Source category name | string | +| Source subcategory | integer | +| Source subcategory name | string | +| Agency code | integer | +| Agency name | string | +| Bureau code | integer | +| Bureau name | string | +| Account code | integer | +| Account name | string | +| Treasury Agency code | integer | +| On- or off-budget | string | +| 1962 | string | +| 1963 | string | +| 1964 | string | +| 1965 | string | +| 1966 | string | +| 1967 | string | +| 1968 | string | +| 1969 | string | +| 1970 | string | +| 1971 | string | +| 1972 | string | +| 1973 | string | +| 1974 | string | +| 1975 | string | +| 1976 | string | +| TQ | string | +| 1977 | string | +| 1978 | string | +| 1979 | string | +| 1980 | string | +| 1981 | string | +| 1982 | string | +| 1983 | string | +| 1984 | string | +| 1985 | string | +| 1986 | string | +| 1987 | string | +| 1988 | string | +| 1989 | string | +| 1990 | string | +| 1991 | string | +| 1992 | string | +| 1993 | string | +| 1994 | string | +| 1995 | string | +| 1996 | string | +| 1997 | string | +| 1998 | string | +| 1999 | string | +| 2000 | string | +| 2001 | string | +| 2002 | string | +| 2003 | string | +| 2004 | string | +| 2005 | string | +| 
2006 | string | +| 2007 | string | +| 2008 | string | +| 2009 | string | +| 2010 | string | +| 2011 | string | +| 2012 | string | +| 2013 | string | +| 2014 | string | +| 2015 | string | +| 2016 | string | +| 2017 | string | +| 2018 | string | +| 2019 | string | +| 2020 | string | +## `budgets.json` + - `path` budgets.json + - `schema` + + | name | type | +|:-------------|:--------| +| budgetYear | integer | +| forecastYear | integer | +| value | number | +## `burtin.json` + - `description` The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine. + + The dataset compares the performance of three antibiotics against 16 different bacteria. + + Numerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness. + + The dataset was featured as an example in the Protovis project, a precursor to D3.js. + + As noted in the Protovis example, "Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 μg/ml, and an exaggeration of some values for penicillin". + + The vega-datasets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together. + + The caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) + reads as follows: + + > ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin + > + > + > The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in red and gram- in blue) with their sensitivities to penicillin, and streptomycin.
+ > + > The effectiveness of the antibiotics is expressed as the highest dilution in μ/ml. which inhibits the test organism. + > + > High dilutions are toward the periphery; consequently the length of the colored bar is proportional to the effectiveness. + > + > It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. vulgaris, S. schottmuelleri and M. tuberculosis. + > + > Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to neomycin, although the majority of these are sensitive to neomycin. + > + > It also inhibits actinomycetes, but is inactive against viruses and fungi. Its mode of action is not understood. + + - `path` burtin.json + - `schema` + + | name | type | +|:--------------|:-------| +| Bacteria | string | +| Penicillin | number | +| Streptomycin | number | +| Neomycin | number | +| Gram_Staining | string | +| Genus | string | +## `cars.json` + - `description` Collection of car specifications and performance metrics from various automobile manufacturers. + - `path` cars.json + - `schema` + + | name | type | +|:-----------------|:--------| +| Name | string | +| Miles_per_Gallon | integer | +| Cylinders | integer | +| Displacement | number | +| Horsepower | integer | +| Weight_in_lbs | integer | +| Acceleration | number | +| Year | date | +| Origin | string | +## `co2-concentration.csv` + - `description` Scripps CO2 program data but modified to only include date, CO2, seasonally adjusted CO2. + Only includes rows with valid data.
+ - `path` co2-concentration.csv + - `schema` + + | name | type | +|:-------------|:-------| +| Date | date | +| CO2 | number | +| adjusted CO2 | number | +## `countries.json` + - `description` This dataset combines key demographic indicators (life expectancy at birth and + fertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year + intervals. It includes both current values and adjacent time period values (previous and next) + for each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) + notes that its philosophy is to fill data gaps with estimates and use current + geographic boundaries for historical data. Gapminder states that it aims to "show people the + big picture" rather than support detailed numeric analysis. + - `path` countries.json + - `schema` + + | name | type | description | +|:--------------|:--------|:-------------------------------------------------------------------------| +| _comment | string | | +| year | integer | Years from 1955 to 2000 at 5-year intervals | +| fertility | number | Fertility rate (average number of children per woman) for the given year | +| life_expect | number | Life expectancy in years for the given year | +| n_fertility | number | Fertility rate for the next 5-year interval | +| n_life_expect | number | Life expectancy for the next 5-year interval | +| country | string | Name of the country | +## `crimea.json` + - `path` crimea.json + - `schema` + + | name | type | +|:--------|:--------| +| date | date | +| wounds | integer | +| other | integer | +| disease | integer | +## `disasters.csv` + - `description` Annual number of deaths from disasters. 
+ - `path` disasters.csv + - `schema` + + | name | type | +|:-------|:--------| +| Entity | string | +| Year | integer | +| Deaths | integer | +## `driving.json` + - `path` driving.json + - `schema` + + | name | type | +|:-------|:--------| +| side | string | +| year | integer | +| miles | integer | +| gas | number | +## `earthquakes.json` + - `description` Earthquake data retrieved Feb 6, 2018 + - `path` earthquakes.json +## `ffox.png` + - `description` Application icons from open-source software projects. + - `path` ffox.png +## `flare-dependencies.json` + - `path` flare-dependencies.json + - `schema` + + | name | type | +|:-------|:--------| +| source | integer | +| target | integer | +## `flare.json` + - `path` flare.json + - `schema` + + | name | type | +|:-------|:--------| +| id | integer | +| name | string | +## `flights-10k.json` + - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` + - `path` flights-10k.json + - `schema` + + | name | type | +|:------------|:--------| +| date | string | +| delay | integer | +| distance | integer | +| origin | string | +| destination | string | +## `flights-200k.arrow` + - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` + - `path` flights-200k.arrow + - `schema` + + | name | type | +|:---------|:--------| +| delay | integer | +| distance | integer | +| time | number | +## `flights-200k.json` + - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` + - `path` flights-200k.json + - `schema` + + | name | type | +|:---------|:--------| +| delay | integer | +| distance | integer | +| time | number | +## `flights-20k.json` + - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. 
Transformed using `/scripts/flights.py` + - `path` flights-20k.json + - `schema` + + | name | type | +|:------------|:--------| +| date | string | +| delay | integer | +| distance | integer | +| origin | string | +| destination | string | +## `flights-2k.json` + - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` + - `path` flights-2k.json + - `schema` + + | name | type | +|:------------|:--------| +| date | string | +| delay | integer | +| distance | integer | +| origin | string | +| destination | string | +## `flights-3m.parquet` + - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` + - `path` flights-3m.parquet + - `schema` + + | name | type | +|:------------|:---------| +| date | datetime | +| delay | integer | +| distance | integer | +| origin | string | +| destination | string | +## `flights-5k.json` + - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` + - `path` flights-5k.json + - `schema` + + | name | type | +|:------------|:--------| +| date | string | +| delay | integer | +| distance | integer | +| origin | string | +| destination | string | +## `flights-airport.csv` + - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` + - `path` flights-airport.csv + - `schema` + + | name | type | +|:------------|:--------| +| origin | string | +| destination | string | +| count | integer | +## `football.json` + - `description` Football match outcomes across multiple divisions from 2013 to 2017, part of a + larger dataset from OpenFootball. The subset was made such that there are records for all five + chosen divisions over the time period. 
+ - `path` football.json + - `schema` + + | name | type | +|:-----------|:--------| +| date | date | +| division | string | +| home_team | string | +| away_team | string | +| home_score | integer | +| away_score | integer | +## `gapminder-health-income.csv` + - `description` Per-capita income, life expectancy, population and regional grouping. Dataset does not specify + the reference year for the data. Gapminder historical data is subject to revisions. + + Gapminder (v30, 2023) defines per-capita income as follows: + >"This is real GDP per capita (gross domestic product per person adjusted for inflation) + >converted to international dollars using purchasing power parity rates. An international dollar + >has the same purchasing power over GDP as the U.S. dollar has in the United States." + + - `path` gapminder-health-income.csv + - `schema` + + | name | type | +|:-----------|:--------| +| country | string | +| income | integer | +| health | number | +| population | integer | +| region | string | +## `gapminder.json` + - `description` This dataset combines key demographic indicators (life expectancy at birth, + population, and fertility rate measured as babies per woman) for various countries from 1955 + to 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable + grouping countries. Gapminder's data documentation notes that its philosophy is to fill data + gaps with estimates and use current geographic boundaries for historical data. Gapminder + states that it aims to "show people the big picture" rather than support detailed numeric + analysis. + + Notes: + 1. Country Selection: The set of countries in this file matches the version of this dataset + originally added to this collection in 2015. The specific criteria for country selection + in that version are not known. Data for Aruba are no longer available in the new version. + Hong Kong has been revised to Hong Kong, China in the new version. + + 2. 
Data Precision: The precision of float values may have changed from the original version. + These changes reflect the most recent source data used for each indicator. + + 3. Regional Groupings: The 'cluster' column represents a regional mapping of countries + corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. To + preserve continuity with previous versions of this dataset, we have retained the column + name 'cluster' instead of renaming it to 'six_regions'. The six regions represented are: + `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`. + - `path` gapminder.json + - `schema` + + | name | type | description | +|:------------|:--------|:-----------------------------------------------------------------| +| year | integer | Years from 1955 to 2005 at 5-year intervals | +| country | string | Name of the country | +| cluster | integer | A categorical variable (values 0-5) grouping countries by region | +| pop | integer | Population of the country | +| life_expect | number | Life expectancy in years | +| fertility | number | Fertility rate (average number of children per woman) | +## `gimp.png` + - `description` Application icons from open-source software projects. + - `path` gimp.png +## `github.csv` + - `description` Generated using `/scripts/github.py`. + - `path` github.csv + - `schema` + + | name | type | +|:-------|:--------| +| time | string | +| count | integer | +## `global-temp.csv` + - `description` Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, L-OTI), 1880-2023.
+ - `path` global-temp.csv + - `schema` + + | name | type | +|:-------|:--------| +| year | integer | +| temp | number | +## `income.json` + - `path` income.json + - `schema` + + | name | type | +|:-------|:--------| +| name | string | +| region | string | +| id | integer | +| pct | number | +| total | integer | +| group | string | +## `iowa-electricity.csv` + - `description` The state of Iowa has dramatically increased its production of renewable + wind power in recent years. This file contains the annual net generation of electricity in + the state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. + It is useful for illustrating stacked area charts. + - `path` iowa-electricity.csv + - `schema` + + | name | type | +|:---------------|:--------| +| year | date | +| source | string | +| net_generation | integer | +## `jobs.json` + - `description` U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which "collects, preserves and harmonizes U.S. census microdata" from as early as 1790. + + Originally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Viégas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). + The dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/). + + Data is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of IPUMS USA, according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. 
Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions). + + IPUMS USA confirmed in 2024 correspondence that hosting this dataset on vega-datasets is permissible, stating: + >We're excited to hear that this dataset made its way to this repository and is being used by students for data visualization. We allow for these types of redistributions of summary data so long as the underlying microdata records are not shared. + + This dataset contains only summary statistics and does not include any underlying microdata records. + + 1. This dataset represents summary data. The underlying microdata records are not included. + 2. Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section) + (person weight) variable as an expansion factor when working with IPUMS USA extracts. + 3. Due to coding revisions, figures for earlier years (particularly 19th century) may not match current IPUMS USA data exactly. + + When using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml). + The organization requests use of the following citation for this json file: + + Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0 + + - `path` jobs.json + - `schema` + + | name | type | description | +|:-------|:--------|:----------------------------------------------| +| job | string | The occupation title | +| sex | string | Sex (men/women) | +| year | integer | Census year | +| count | integer | Number of individuals in the occupation | +| perc | number | Percentage of the workforce in the occupation | +## `la-riots.csv` + - `description` More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles + for five days starting on April 29, 1992. 
This file contains metadata about each person, including the geographic + coordinates of their death. Compiled and published by the Los Angeles Times Data Desk. + - `path` la-riots.csv + - `schema` + + | name | type | +|:-------------|:--------| +| first_name | string | +| last_name | string | +| age | integer | +| gender | string | +| race | string | +| death_date | date | +| address | string | +| neighborhood | string | +| type | string | +| longitude | number | +| latitude | number | +## `londonboroughs.json` + - `description` Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. + Original data "contains National Statistics data © Crown copyright and database right (2015)" + and "Contains Ordnance Survey data © Crown copyright and database right [2015]". + - `path` londonBoroughs.json +## `londoncentroids.json` + - `description` Calculated from `londonBoroughs.json` using `d3.geoCentroid`. + - `path` londonCentroids.json + - `schema` + + | name | type | +|:-------|:-------| +| name | string | +| cx | number | +| cy | number | +## `londontubelines.json` + - `description` Selected rail lines simplified from source. + - `path` londonTubeLines.json +## `lookup_groups.csv` + - `path` lookup_groups.csv + - `schema` + + | name | type | +|:-------|:--------| +| group | integer | +| person | string | +## `lookup_people.csv` + - `path` lookup_people.csv + - `schema` + + | name | type | +|:-------|:--------| +| name | string | +| age | integer | +| height | integer | +## `miserables.json` + - `path` miserables.json +## `monarchs.json` + - `description` A chronological list of English and British monarchs from Elizabeth I through George IV. + Each entry includes: + + The dataset contains two intentional inaccuracies to maintain compatibility with + the [Wheat and Wages](https://vega.github.io/vega/examples/wheat-and-wages/) example visualization: + 1.
the start date for the reign of Elizabeth I is shown as 1565, instead of 1558; + 2. the end date for the reign of George IV is shown as 1820, instead of 1830. + These discrepancies align the `monarchs.json` dataset with the start and end dates of the `wheat.json` dataset used in the visualization. + The entry "W&M" represents the joint reign of William III and Mary II. While the dataset shows their reign as 1689-1702, + the official Web site of the British royal family indicates that Mary II's reign ended in 1694, though William III continued to rule until 1702. + The `commonwealth` field is used to flag the period from 1649 to 1660, which includes the Commonwealth of England, the Protectorate, + and the period leading to the Restoration. While historically more accurate to call this the "interregnum," the field name of `commonwealth` + from the original dataset is retained for backwards compatibility. + The dataset was revised in Aug. 2024. James II's reign now ends in 1688 (previously 1689). + Source data has been verified against the kings & queens and interregnum pages of the official website of the British royal family (retrieved in Aug. 2024). + Content on the site is protected by Crown Copyright. + Under the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most + Crown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/).
+ - `path` monarchs.json + - `schema` + + | name | type | description | +|:-------|:--------|:------------------------------------------------------------------------------------------------------------| +| name | string | The ruler's name or identifier (e.g., "W&M" for William and Mary, "Cromwell" for the period of interregnum) | +| start | integer | The year their rule began | +| end | integer | The year their rule ended | +| index | integer | A zero-based sequential number assigned to each entry, representing the chronological order of rulers | +## `movies.json` + - `description` The dataset has well known and intentionally included errors. + This dataset is provided for instructional purposes, including the need to reckon with dirty data. + - `path` movies.json + - `schema` + + | name | type | +|:-----------------------|:--------| +| Title | string | +| US Gross | integer | +| Worldwide Gross | integer | +| US DVD Sales | integer | +| Production Budget | integer | +| Release Date | string | +| MPAA Rating | string | +| Running Time min | integer | +| Distributor | string | +| Source | string | +| Major Genre | string | +| Creative Type | string | +| Director | string | +| Rotten Tomatoes Rating | integer | +| IMDB Rating | number | +| IMDB Votes | integer | +## `normal-2d.json` + - `path` normal-2d.json + - `schema` + + | name | type | +|:-------|:-------| +| u | number | +| v | number | +## `obesity.json` + - `path` obesity.json + - `schema` + + | name | type | +|:-------|:--------| +| id | integer | +| rate | number | +| state | string | +## `ohlc.json` + - `description` This dataset contains the performance of the Chicago Board Options Exchange + [Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/ + %5EVIX#overview)) in the summer of 2009. 
+ - `path` ohlc.json + - `schema` + + | name | type | +|:-------|:-------| +| date | date | +| open | number | +| high | number | +| low | number | +| close | number | +| signal | string | +| ret | number | +## `penguins.json` + - `description` Palmer Archipelago (Antarctica) penguin data collected and made available by + [Dr. Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) + and the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research + Network](https://lternet.edu/). + - `path` penguins.json + - `schema` + + | name | type | +|:--------------------|:--------| +| Species | string | +| Island | string | +| Beak Length (mm) | number | +| Beak Depth (mm) | number | +| Flipper Length (mm) | integer | +| Body Mass (g) | integer | +| Sex | string | +## `platformer-terrain.json` + - `description` Assets from the video game Celeste. + - `path` platformer-terrain.json + - `schema` + + | name | type | +|:-----------|:--------| +| x | integer | +| y | integer | +| lumosity | number | +| saturation | integer | +| name | string | +| id | string | +| color | string | +| key | string | +## `points.json` + - `path` points.json + - `schema` + + | name | type | +|:-------|:-------| +| x | number | +| y | number | +## `political-contributions.json` + - `description` Summary financial information on contributions to candidates for U.S. + elections. An updated version of this dataset is available from the "all candidates" files + (in pipe-delimited format) on the bulk data download page of the U.S. Federal Election + Commission, or, alternatively, via OpenFEC. Information on each of the 25 columns is + available from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/). + The sample dataset in `political-contributions.json` contains 58 records with dates from 2015.
+ + FEC data is subject to the commission's: + - [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/) + - [Privacy and Security Policy](https://www.fec.gov/about/privacy-and-security-policy/) + - [Acceptable Use Policy](https://github.com/fecgov/FEC/blob/master/ACCEPTABLE-USE-POLICY.md) + + Additionally, the FEC's Github [repository](https://github.com/fecgov/FEC) states: + > This project is in the public domain within the United States, and we waive worldwide + > copyright and related rights through [CC0 universal public domain](https://creativecommons.org/publicdomain/zero/1.0/) + > dedication. Read more on our license page. + > A few restrictions limit the way you can use FEC data. For example, you can't use + > contributor lists for commercial purposes or to solicit donations. Learn more on + > [FEC.gov](https://www.fec.gov/). + - `path` political-contributions.json + - `schema` + + | name | type | +|:----------------------------------------------|:--------| +| Candidate_Identification | string | +| Candidate_Name | string | +| Incumbent_Challenger_Status | string | +| Party_Code | integer | +| Party_Affiliation | string | +| Total_Receipts | number | +| Transfers_from_Authorized_Committees | integer | +| Total_Disbursements | number | +| Transfers_to_Authorized_Committees | number | +| Beginning_Cash | number | +| Ending_Cash | number | +| Contributions_from_Candidate | number | +| Loans_from_Candidate | integer | +| Other_Loans | integer | +| Candidate_Loan_Repayments | number | +| Other_Loan_Repayments | integer | +| Debts_Owed_By | number | +| Total_Individual_Contributions | integer | +| Candidate_State | string | +| Candidate_District | integer | +| Contributions_from_Other_Political_Committees | integer | +| Contributions_from_Party_Committees | integer | +| Coverage_End_Date | string | +| Refunds_to_Individuals | integer | +| Refunds_to_Committees | integer | +## `population.json` + - `description` United States 
population statistics by sex and age group across decades between 1850 and 2000. + The dataset was obtained from IPUMS USA, which "collects, preserves and harmonizes U.S. census + microdata" from as early as 1790. + + IPUMS updates and revises datasets over time, which may result in discrepancies between this + dataset and current IPUMS data. Details on data revisions are available here. + + When using this dataset, please refer to IPUMS USA terms of use. The organization requests the + use of the following citation for this json file: + Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated + Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. + http://doi.org/10.18128/D010.V6.0 + + - `path` population.json + - `schema` + + | name | type | description | +|:-------|:--------|:--------------------------------------------------------------------| +| year | integer | Four-digit year of the survey | +| age | integer | Age group in 5-year intervals (0=0-4, 5=5-9, 10=10-14, ..., 90=90+) | +| sex | integer | Sex (1=men, 2=women) | +| people | integer | Number of individuals (IPUMS PERWT) | +## `population_engineers_hurricanes.csv` + - `description` Per-state data on population, number of engineers, and hurricanes. Used in Vega-Lite example, + [Three Choropleths Representing Disjoint Data from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html) + - `path` population_engineers_hurricanes.csv + - `schema` + + | name | type | +|:-----------|:--------| +| state | string | +| id | integer | +| population | integer | +| engineers | number | +| hurricanes | integer | +## `seattle-weather-hourly-normals.csv` + - `description` Hourly weather normals with metric units. The 1981-2010 Climate Normals are + NCDC's three-decade averages of climatological variables, including temperature and + precipitation.
Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf). + We only included temperature, wind, and pressure + and updated the format to be easier to parse. + - `path` seattle-weather-hourly-normals.csv + - `schema` + + | name | type | +|:------------|:---------| +| date | datetime | +| pressure | number | +| temperature | number | +| wind | number | +## `seattle-weather.csv` + - `description` Daily weather records with metric units. Transformed using `/scripts/weather.py`. + The categorical "weather" field is synthesized from multiple fields in the original dataset. + This data is intended for instructional purposes. + - `path` seattle-weather.csv + - `schema` + + | name | type | +|:--------------|:-------| +| date | date | +| precipitation | number | +| temp_max | number | +| temp_min | number | +| wind | number | +| weather | string | +## `sp500-2000.csv` + - `description` S&P 500 index values from 2000 to 2020. + - `path` sp500-2000.csv + - `schema` + + | name | type | +|:---------|:--------| +| date | date | +| open | number | +| high | number | +| low | number | +| close | number | +| adjclose | number | +| volume | integer | +## `sp500.csv` + - `path` sp500.csv + - `schema` + + | name | type | +|:-------|:-------| +| date | string | +| price | number | +## `stocks.csv` + - `path` stocks.csv + - `schema` + + | name | type | +|:-------|:-------| +| symbol | string | +| date | string | +| price | number | +## `udistrict.json` + - `path` udistrict.json + - `schema` + + | name | type | +|:-------|:-------| +| key | string | +| lat | number | +## `unemployment-across-industries.json` + - `description` Industry-level unemployment statistics from the Current Population Survey + (CPS), published monthly by the U.S. Bureau of Labor Statistics. Includes unemployed persons + and unemployment rate across 11 private industries, as well as agricultural, government, and + self-employed workers. 
Covers January 2000 through February 2010. Industry classification + follows format of CPS Table A-31. + + The dataset can be replicated using the BLS API. For more, see the `scripts` folder of this + repository. + + The BLS Web site states: + > "Users of the public API should cite the date that data were accessed or retrieved using + > the API. Users must clearly state that "BLS.gov cannot vouch for the data or analyses + > derived from these data after the data have been retrieved from BLS.gov." The BLS.gov logo + > may not be used by persons who are not BLS employees or on products (including web pages) + > that are not BLS-sponsored." + + See full BLS [terms of service](https://www.bls.gov/developers/termsOfService.htm). + - `path` unemployment-across-industries.json + - `schema` + + | name | type | description | +|:-------|:---------|:------------------------------------------------------------------| +| series | string | Industry name | +| year | integer | Year (2000-2010) | +| month | integer | Month (1-12) | +| count | integer | Number of unemployed persons (in thousands) | +| rate | number | Unemployment rate (percentage) | +| date | datetime | ISO 8601-formatted date string (e.g., "2000-01-01T08:00:00.000Z") | +## `unemployment.tsv` + - `description` This dataset contains county-level unemployment rates in the United States, with data generally + consistent with levels reported in 2009. The dataset is structured as tab-separated values. + The unemployment rate represents the number of unemployed persons as a percentage of the labor + force. According to the Bureau of Labor Statistics (BLS) glossary: + + Unemployed persons (Current Population Survey) [are] persons aged 16 years and older who had + no employment during the reference week, were available for work, except for temporary + illness, and had made specific efforts to find employment sometime during the 4-week period + ending with the reference week. 
Persons who were waiting to be recalled to a job from which + they had been laid off need not have been looking for work to be classified as unemployed. + + This dataset is derived from the [Local Area Unemployment Statistics (LAUS)](https://www.bls.gov/lau/) program, + a federal-state cooperative effort overseen by the Bureau of Labor Statistics (BLS). + The LAUS program produces monthly and annual employment, unemployment, and labor force data for census regions and divisions, + states, counties, metropolitan areas, and many cities and towns. + + For the most up-to-date LAUS data: + 1. **Monthly and Annual Data Downloads**: + - Visit the [LAUS Data Tools](https://www.bls.gov/lau/data.htm) page for [monthly](https://www.bls.gov/lau/tables.htm#mcounty) + and [annual](https://www.bls.gov/lau/tables.htm#cntyaa) county data. + 2. **BLS Public Data API**: + - The BLS provides an API for developers to access various datasets, including LAUS data. + - To use the API for LAUS data, refer to the [LAUS Series ID Formats](https://www.bls.gov/help/hlpforma.htm#LA) to construct your query. + - API documentation and examples are available on the BLS Developers page. + + When using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm). + - `path` unemployment.tsv + - `schema` + + | name | type | description | +|:-------|:--------|:----------------------------------------| +| id | integer | The combined state and county FIPS code | +| rate | number | The unemployment rate for the county | +## `uniform-2d.json` + - `path` uniform-2d.json + - `schema` + + | name | type | +|:-------|:-------| +| u | number | +| v | number | +## `us-10m.json` + - `path` us-10m.json +## `us-employment.csv` + - `description` In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job + losses across the United States. 
The downturn in employment, and the slow recovery in hiring that + followed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau + of Labor Statistics. + + This file contains the monthly employment total in a variety of job categories from January 2006 + through December 2015. The numbers are seasonally adjusted and reported in thousands. The data + were downloaded on Nov. 11, 2018, and reformatted for use in this library. + + Totals are included for the [22 "supersectors"](https://download.bls.gov/pub/time.series/ce/ce.supersector) + tracked by the BLS. The "nonfarm" total is the category typically used by + economists and journalists as a stand-in for the country's employment total. + + A calculated "nonfarm_change" column has been appended with the month-to-month change in that + supersector's employment. It is useful for illustrating how to make bar charts that report both + negative and positive values. + + - `path` us-employment.csv + - `schema` + + | name | type | +|:-----------------------------------|:--------| +| month | date | +| nonfarm | integer | +| private | integer | +| goods_producing | integer | +| service_providing | integer | +| private_service_providing | integer | +| mining_and_logging | integer | +| construction | integer | +| manufacturing | integer | +| durable_goods | integer | +| nondurable_goods | integer | +| trade_transportation_utilties | integer | +| wholesale_trade | number | +| retail_trade | number | +| transportation_and_warehousing | number | +| utilities | number | +| information | integer | +| financial_activities | integer | +| professional_and_business_services | integer | +| education_and_health_services | integer | +| leisure_and_hospitality | integer | +| other_services | integer | +| government | integer | +| nonfarm_change | integer | +## `us-state-capitals.json` + - `path` us-state-capitals.json + - `schema` + + | name | type | +|:-------|:-------| +| lon | number | +| lat | number | 
+| state | string | +| city | string | +## `volcano.json` + - `description` Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. + This data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a + topographic map by Ross Ihaka, adapted from R datasets. These data should not be regarded as accurate. + - `path` volcano.json +## `weather.csv` + - `description` NOAA data transformed using `/scripts/weather.py`. Categorical "weather" field synthesized + from multiple fields in the original dataset. This data is intended for instructional purposes. + - `path` weather.csv + - `schema` + + | name | type | +|:--------------|:-------| +| location | string | +| date | date | +| precipitation | number | +| temp_max | number | +| temp_min | number | +| wind | number | +| weather | string | +## `weather.json` + - `description` Instructional dataset showing actual and predicted temperature data. + - `path` weather.json +## `wheat.json` + - `description` In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair), + a Scottish engineer who is often credited as the founder of statistical graphics, + published an elegant chart on the price of wheat. It plots 250 years of prices alongside + weekly wages and the reigning monarch. He intended to demonstrate that "never at any former period + was wheat so cheap, in proportion to mechanical labour, as it is at the present time." + - `path` wheat.json + - `schema` + + | name | type | +|:-------|:--------| +| year | integer | +| wheat | number | +| wages | number | +## `windvectors.csv` + - `description` Simulated wind patterns over northwestern Europe. 
+ - `path` windvectors.csv + - `schema` + + | name | type | +|:----------|:--------| +| longitude | number | +| latitude | number | +| dir | integer | +| dirCat | integer | +| speed | number | +## `world-110m.json` + - `path` world-110m.json +## `zipcodes.csv` + - `description` GeoNames.org + - `path` zipcodes.csv + - `schema` + + | name | type | +|:----------|:--------| +| zip_code | integer | +| latitude | number | +| longitude | number | +| city | string | +| state | string | +| county | string | \ No newline at end of file diff --git a/datapackage.json b/datapackage.json index b5c20bab..7db7733b 100644 --- a/datapackage.json +++ b/datapackage.json @@ -16,7 +16,7 @@ } ], "version": "2.11.0", - "created": "2024-12-13T12:21:52.733563+00:00", + "created": "2024-12-13T12:53:03.887410+00:00", "resources": [ { "name": "7zip.png", diff --git a/datapackage.md b/datapackage.md new file mode 100644 index 00000000..7cb8f531 --- /dev/null +++ b/datapackage.md @@ -0,0 +1,1386 @@ +# `vega-datasets`- `description` Common repository for example datasets used by Vega related projects. +- `homepage` http://github.com/vega/vega-datasets.git +- `licenses` + - [1] + - `name` BSD-3-Clause + - `path` https://opensource.org/license/bsd-3-clause + - `title` The 3-Clause BSD License +- `contributors` + - [1] + - `title` UW Interactive Data Lab + - `path` http://idl.cs.washington.edu +- `version` 2.11.0 +- `created` 2024-12-13T12:53:03.887410+00:00 +## `7zip.png` + - `description` Application icons from open-source software projects. + - `path` 7zip.png +## `airports.csv` + - `path` airports.csv + - `schema` + +### `iata` + - `type` string +### `name` + - `type` string +### `city` + - `type` string +### `state` + - `type` string +### `country` + - `type` string +### `latitude` + - `type` number +### `longitude` + - `type` number +## `annual-precip.json` + - `description` A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell. 
+ - `path` annual-precip.json +## `anscombe.json` + - `description` Graphs in Statistical Analysis, F. J. Anscombe, The American Statistician. + - `path` anscombe.json + - `schema` + +### `Series` + - `type` string +### `X` + - `type` integer +### `Y` + - `type` number +## `barley.json` + - `description` The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites. + + It was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper "Statistical Determination of Barley Varietal Adaptation". + + R.A. Fisher popularized its use in the field of statistics when he included it in his book "The Design of Experiments". + + Since then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s. + + - `path` barley.json + - `schema` + +### `yield` + - `type` number +### `variety` + - `type` string +### `year` + - `type` integer +### `site` + - `type` string +## `birdstrikes.csv` + - `description` Records of reported wildlife strikes received by the U.S. FAA + - `path` birdstrikes.csv + - `schema` + +### `Airport Name` + - `type` string +### `Aircraft Make Model` + - `type` string +### `Effect Amount of damage` + - `type` string +### `Flight Date` + - `type` date +### `Aircraft Airline Operator` + - `type` string +### `Origin State` + - `type` string +### `Phase of flight` + - `type` string +### `Wildlife Size` + - `type` string +### `Wildlife Species` + - `type` string +### `Time of day` + - `type` string +### `Cost Other` + - `type` integer +### `Cost Repair` + - `type` integer +### `Cost Total $` + - `type` integer +### `Speed IAS in knots` + - `type` integer +## `budget.json` + - `description` Historical and forecasted federal revenue/receipts produced in 2016 by the U.S. Office of Management and Budget. 
+ - `path` budget.json + - `schema` + +### `Source Category Code` + - `type` integer +### `Source category name` + - `type` string +### `Source subcategory` + - `type` integer +### `Source subcategory name` + - `type` string +### `Agency code` + - `type` integer +### `Agency name` + - `type` string +### `Bureau code` + - `type` integer +### `Bureau name` + - `type` string +### `Account code` + - `type` integer +### `Account name` + - `type` string +### `Treasury Agency code` + - `type` integer +### `On- or off-budget` + - `type` string +### `1962` + - `type` string +### `1963` + - `type` string +### `1964` + - `type` string +### `1965` + - `type` string +### `1966` + - `type` string +### `1967` + - `type` string +### `1968` + - `type` string +### `1969` + - `type` string +### `1970` + - `type` string +### `1971` + - `type` string +### `1972` + - `type` string +### `1973` + - `type` string +### `1974` + - `type` string +### `1975` + - `type` string +### `1976` + - `type` string +### `TQ` + - `type` string +### `1977` + - `type` string +### `1978` + - `type` string +### `1979` + - `type` string +### `1980` + - `type` string +### `1981` + - `type` string +### `1982` + - `type` string +### `1983` + - `type` string +### `1984` + - `type` string +### `1985` + - `type` string +### `1986` + - `type` string +### `1987` + - `type` string +### `1988` + - `type` string +### `1989` + - `type` string +### `1990` + - `type` string +### `1991` + - `type` string +### `1992` + - `type` string +### `1993` + - `type` string +### `1994` + - `type` string +### `1995` + - `type` string +### `1996` + - `type` string +### `1997` + - `type` string +### `1998` + - `type` string +### `1999` + - `type` string +### `2000` + - `type` string +### `2001` + - `type` string +### `2002` + - `type` string +### `2003` + - `type` string +### `2004` + - `type` string +### `2005` + - `type` string +### `2006` + - `type` string +### `2007` + - `type` string +### `2008` + - `type` string +### `2009` + - 
`type` string +### `2010` + - `type` string +### `2011` + - `type` string +### `2012` + - `type` string +### `2013` + - `type` string +### `2014` + - `type` string +### `2015` + - `type` string +### `2016` + - `type` string +### `2017` + - `type` string +### `2018` + - `type` string +### `2019` + - `type` string +### `2020` + - `type` string +## `budgets.json` + - `path` budgets.json + - `schema` + +### `budgetYear` + - `type` integer +### `forecastYear` + - `type` integer +### `value` + - `type` number +## `burtin.json` + - `description` The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine. + + The dataset compares the performance of three antibiotics against 16 different bacteria. + + Numerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness. + + The dataset was featured as an example in the Protovis project, a precursor to D3.js. + + As noted in the Protovis example, "Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 μg/ml, and an exaggeration of some values for penicillin". + + The vega-datasets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together. + + The caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) + reads as follows: + + > ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin + > + > + > The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in red and gram- in blue) with their sensitivities to penicillin, and streptomycin. 
+ > + > The effectiveness of the antibiotics is expressed as the highest dilution in μ/ml. which inhibits the test organism. + > + > High dilutions are toward the periphery; consequently the length of the colored bar is proportional to the effectiveness. + > + > It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. vulgaris, S. schottmuelleri and M. tuberculosis. + > + > Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to neomycin, although the majority of these are sensitive to neomycin. + > + > It also inhibits actinomycetes, but is inactive against viruses and fungi. Its mode of action is not understood. + + - `path` burtin.json + - `schema` + +### `Bacteria` + - `type` string +### `Penicillin` + - `type` number +### `Streptomycin` + - `type` number +### `Neomycin` + - `type` number +### `Gram_Staining` + - `type` string +### `Genus` + - `type` string +## `cars.json` + - `description` Collection of car specifications and performance metrics from various automobile manufacturers. + - `path` cars.json + - `schema` + +### `Name` + - `type` string +### `Miles_per_Gallon` + - `type` integer +### `Cylinders` + - `type` integer +### `Displacement` + - `type` number +### `Horsepower` + - `type` integer +### `Weight_in_lbs` + - `type` integer +### `Acceleration` + - `type` number +### `Year` + - `type` date +### `Origin` + - `type` string +## `co2-concentration.csv` + - `description` Scripps CO2 program data but modified to only include date, CO2, seasonally adjusted CO2. + Only includes rows with valid data. 
+ - `path` co2-concentration.csv + - `schema` + +### `Date` + - `type` date +### `CO2` + - `type` number +### `adjusted CO2` + - `type` number +## `countries.json` + - `description` This dataset combines key demographic indicators (life expectancy at birth and + fertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year + intervals. It includes both current values and adjacent time period values (previous and next) + for each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) + notes that its philosophy is to fill data gaps with estimates and use current + geographic boundaries for historical data. Gapminder states that it aims to "show people the + big picture" rather than support detailed numeric analysis. + - `path` countries.json + - `schema` + +### `_comment` + - `type` string +### `year` + - `description` Years from 1955 to 2000 at 5-year intervals + - `type` integer +### `fertility` + - `description` Fertility rate (average number of children per woman) for the given year + - `type` number +### `life_expect` + - `description` Life expectancy in years for the given year + - `type` number +### `n_fertility` + - `description` Fertility rate for the next 5-year interval + - `type` number +### `n_life_expect` + - `description` Life expectancy for the next 5-year interval + - `type` number +### `country` + - `description` Name of the country + - `type` string +## `crimea.json` + - `path` crimea.json + - `schema` + +### `date` + - `type` date +### `wounds` + - `type` integer +### `other` + - `type` integer +### `disease` + - `type` integer +## `disasters.csv` + - `description` Annual number of deaths from disasters. 
+ - `path` disasters.csv + - `schema` + +### `Entity` + - `type` string +### `Year` + - `type` integer +### `Deaths` + - `type` integer +## `driving.json` + - `path` driving.json + - `schema` + +### `side` + - `type` string +### `year` + - `type` integer +### `miles` + - `type` integer +### `gas` + - `type` number +## `earthquakes.json` + - `description` Earthquake data retrieved Feb 6, 2018 + - `path` earthquakes.json +## `ffox.png` + - `description` Application icons from open-source software projects. + - `path` ffox.png +## `flare-dependencies.json` + - `path` flare-dependencies.json + - `schema` + +### `source` + - `type` integer +### `target` + - `type` integer +## `flare.json` + - `path` flare.json + - `schema` + +### `id` + - `type` integer +### `name` + - `type` string +## `flights-10k.json` + - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` + - `path` flights-10k.json + - `schema` + +### `date` + - `type` string +### `delay` + - `type` integer +### `distance` + - `type` integer +### `origin` + - `type` string +### `destination` + - `type` string +## `flights-200k.arrow` + - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` + - `path` flights-200k.arrow + - `schema` + +### `delay` + - `type` integer +### `distance` + - `type` integer +### `time` + - `type` number +## `flights-200k.json` + - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` + - `path` flights-200k.json + - `schema` + +### `delay` + - `type` integer +### `distance` + - `type` integer +### `time` + - `type` number +## `flights-20k.json` + - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. 
Transformed using `/scripts/flights.py` + - `path` flights-20k.json + - `schema` + +### `date` + - `type` string +### `delay` + - `type` integer +### `distance` + - `type` integer +### `origin` + - `type` string +### `destination` + - `type` string +## `flights-2k.json` + - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` + - `path` flights-2k.json + - `schema` + +### `date` + - `type` string +### `delay` + - `type` integer +### `distance` + - `type` integer +### `origin` + - `type` string +### `destination` + - `type` string +## `flights-3m.parquet` + - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` + - `path` flights-3m.parquet + - `schema` + +### `date` + - `type` datetime +### `delay` + - `type` integer +### `distance` + - `type` integer +### `origin` + - `type` string +### `destination` + - `type` string +## `flights-5k.json` + - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` + - `path` flights-5k.json + - `schema` + +### `date` + - `type` string +### `delay` + - `type` integer +### `distance` + - `type` integer +### `origin` + - `type` string +### `destination` + - `type` string +## `flights-airport.csv` + - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` + - `path` flights-airport.csv + - `schema` + +### `origin` + - `type` string +### `destination` + - `type` string +### `count` + - `type` integer +## `football.json` + - `description` Football match outcomes across multiple divisions from 2013 to 2017, part of a + larger dataset from OpenFootball. The subset was made such that there are records for all five + chosen divisions over the time period. 
+ - `path` football.json + - `schema` + +### `date` + - `type` date +### `division` + - `type` string +### `home_team` + - `type` string +### `away_team` + - `type` string +### `home_score` + - `type` integer +### `away_score` + - `type` integer +## `gapminder-health-income.csv` + - `description` Per-capita income, life expectancy, population and regional grouping. Dataset does not specify + the reference year for the data. Gapminder historical data is subject to revisions. + + Gapminder (v30, 2023) defines per-capita income as follows: + >"This is real GDP per capita (gross domestic product per person adjusted for inflation) + >converted to international dollars using purchasing power parity rates. An international dollar + >has the same purchasing power over GDP as the U.S. dollar has in the United States." + + - `path` gapminder-health-income.csv + - `schema` + +### `country` + - `type` string +### `income` + - `type` integer +### `health` + - `type` number +### `population` + - `type` integer +### `region` + - `type` string +## `gapminder.json` + - `description` This dataset combines key demographic indicators (life expectancy at birth, + population, and fertility rate measured as babies per woman) for various countries from 1955 + to 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable + grouping countries. Gapminder's data documentation notes that its philosophy is to fill data + gaps with estimates and use current geographic boundaries for historical data. Gapminder + states that it aims to "show people the big picture" rather than support detailed numeric + analysis. + + Notes: + 1. Country Selection: The set of countries in this file matches the version of this dataset + originally added to this collection in 2015. The specific criteria for country selection + in that version are not known. Data for Aruba are no longer available in the new version. + Hong Kong has been revised to Hong Kong, China in the new version. + + 2. 
Data Precision: The precision of float values may have changed from the original version. + These changes reflect the most recent source data used for each indicator. + + 3. Regional Groupings: The 'cluster' column represents a regional mapping of countries + corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. To + preserve continuity with previous versions of this dataset, we have retained the column + name 'cluster' instead of renaming it to 'six_regions'. The six regions represented are: + `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`. + - `path` gapminder.json + - `schema` + +### `year` + - `description` Years from 1955 to 2005 at 5-year intervals + - `type` integer +### `country` + - `description` Name of the country + - `type` string +### `cluster` + - `description` A categorical variable (values 0-5) grouping countries by region + - `type` integer +### `pop` + - `description` Population of the country + - `type` integer +### `life_expect` + - `description` Life expectancy in years + - `type` number +### `fertility` + - `description` Fertility rate (average number of children per woman) + - `type` number +## `gimp.png` + - `description` Application icons from open-source software projects. + - `path` gimp.png +## `github.csv` + - `description` Generated using `/scripts/github.py`. + - `path` github.csv + - `schema` + +### `time` + - `type` string +### `count` + - `type` integer +## `global-temp.csv` + - `description` Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, L-OTI), 1880-2023. 
+ - `path` global-temp.csv + - `schema` + +### `year` + - `type` integer +### `temp` + - `type` number +## `income.json` + - `path` income.json + - `schema` + +### `name` + - `type` string +### `region` + - `type` string +### `id` + - `type` integer +### `pct` + - `type` number +### `total` + - `type` integer +### `group` + - `type` string +## `iowa-electricity.csv` + - `description` The state of Iowa has dramatically increased its production of renewable + wind power in recent years. This file contains the annual net generation of electricity in + the state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. + It is useful for illustrating stacked area charts. + - `path` iowa-electricity.csv + - `schema` + +### `year` + - `type` date +### `source` + - `type` string +### `net_generation` + - `type` integer +## `jobs.json` + - `description` U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which "collects, preserves and harmonizes U.S. census microdata" from as early as 1790. + + Originally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Viégas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). + The dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/). + + Data is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of IPUMS USA, according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. 
Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions). + + IPUMS USA confirmed in 2024 correspondence that hosting this dataset on vega-datasets is permissible, stating: + >We're excited to hear that this dataset made its way to this repository and is being used by students for data visualization. We allow for these types of redistributions of summary data so long as the underlying microdata records are not shared. + + This dataset contains only summary statistics and does not include any underlying microdata records. + + 1. This dataset represents summary data. The underlying microdata records are not included. + 2. Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section) + (person weight) variable as an expansion factor when working with IPUMS USA extracts. + 3. Due to coding revisions, figures for earlier years (particularly 19th century) may not match current IPUMS USA data exactly. + + When using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml). + The organization requests use of the following citation for this json file: + + Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. 
http://doi.org/10.18128/D010.V6.0 + + - `path` jobs.json + - `schema` + +### `job` + - `description` The occupation title + - `type` string +### `sex` + - `description` Sex (men/women) + - `type` string +### `year` + - `description` Census year + - `type` integer +### `count` + - `description` Number of individuals in the occupation + - `type` integer +### `perc` + - `description` Percentage of the workforce in the occupation + - `type` number +## `la-riots.csv` + - `description` More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles + for five days starting on April 29, 1992. This file contains metadata about each person, including the geographic + coordinates of their death. Compiled and published by the Los Angeles Times Data Desk. + - `path` la-riots.csv + - `schema` + +### `first_name` + - `type` string +### `last_name` + - `type` string +### `age` + - `type` integer +### `gender` + - `type` string +### `race` + - `type` string +### `death_date` + - `type` date +### `address` + - `type` string +### `neighborhood` + - `type` string +### `type` + - `type` string +### `longitude` + - `type` number +### `latitude` + - `type` number +## `londonboroughs.json` + - `description` Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. + Original data "contains National Statistics data © Crown copyright and database right (2015)" + and "Contains Ordnance Survey data © Crown copyright and database right [2015]". + - `path` londonBoroughs.json +## `londoncentroids.json` + - `description` Calculated from `londonBoroughs.json` using `d3.geoCentroid`. + - `path` londonCentroids.json + - `schema` + +### `name` + - `type` string +### `cx` + - `type` number +### `cy` + - `type` number +## `londontubelines.json` + - `description` Selected rail lines simplified from source. 
+ - `path` londonTubeLines.json +## `lookup_groups.csv` + - `path` lookup_groups.csv + - `schema` + +### `group` + - `type` integer +### `person` + - `type` string +## `lookup_people.csv` + - `path` lookup_people.csv + - `schema` + +### `name` + - `type` string +### `age` + - `type` integer +### `height` + - `type` integer +## `miserables.json` + - `path` miserables.json +## `monarchs.json` + - `description` A chronological list of English and British monarchs from Elizabeth I through George IV. + Each entry includes: + + The dataset contains two intentional inaccuracies to maintain compatibility with + the [Wheat and Wages](https://vega.github.io/vega/examples/wheat-and-wages/) example visualization: + 1. the start date for the reign of Elizabeth I is shown as 1565, instead of 1558; + 2. the end date for the reign of George IV is shown as 1820, instead of 1830. + These discrepancies align the `monarchs.json` dataset with the start and end dates of the `wheat.json` dataset used in the visualization. + The entry "W&M" represents the joint reign of William III and Mary II. While the dataset shows their reign as 1689-1702, + the official Web site of the British royal family indicates that Mary II's reign ended in 1694, though William III continued to rule until 1702. + The `commonwealth` field is used to flag the period from 1649 to 1660, which includes the Commonwealth of England, the Protectorate, + and the period leading to the Restoration. While historically more accurate to call this the "interregnum," the field name of `commonwealth` + from the original dataset is retained for backwards compatibility. + The dataset was revised in Aug. 2024. James II's reign now ends in 1688 (previously 1689). + Source data has been verified against the kings & queens and interregnum pages of the official website of the British royal family (retrieved in Aug. 2024). + Content on the site is protected by Crown Copyright.
+ Under the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most + Crown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/). + - `path` monarchs.json + - `schema` + +### `name` + - `description` The ruler's name or identifier (e.g., "W&M" for William and Mary, "Cromwell" for the period of interregnum) + - `type` string +### `start` + - `description` The year their rule began + - `type` integer +### `end` + - `description` The year their rule ended + - `type` integer +### `index` + - `description` A zero-based sequential number assigned to each entry, representing the chronological order of rulers + - `type` integer +## `movies.json` + - `description` The dataset has well known and intentionally included errors. + This dataset is provided for instructional purposes, including the need to reckon with dirty data. 
+ - `path` movies.json + - `schema` + +### `Title` + - `type` string +### `US Gross` + - `type` integer +### `Worldwide Gross` + - `type` integer +### `US DVD Sales` + - `type` integer +### `Production Budget` + - `type` integer +### `Release Date` + - `type` string +### `MPAA Rating` + - `type` string +### `Running Time min` + - `type` integer +### `Distributor` + - `type` string +### `Source` + - `type` string +### `Major Genre` + - `type` string +### `Creative Type` + - `type` string +### `Director` + - `type` string +### `Rotten Tomatoes Rating` + - `type` integer +### `IMDB Rating` + - `type` number +### `IMDB Votes` + - `type` integer +## `normal-2d.json` + - `path` normal-2d.json + - `schema` + +### `u` + - `type` number +### `v` + - `type` number +## `obesity.json` + - `path` obesity.json + - `schema` + +### `id` + - `type` integer +### `rate` + - `type` number +### `state` + - `type` string +## `ohlc.json` + - `description` This dataset contains the performance of the Chicago Board Options Exchange + [Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/ + %5EVIX#overview)) in the summer of 2009. + - `path` ohlc.json + - `schema` + +### `date` + - `type` date +### `open` + - `type` number +### `high` + - `type` number +### `low` + - `type` number +### `close` + - `type` number +### `signal` + - `type` string +### `ret` + - `type` number +## `penguins.json` + - `description` Palmer Archipelago (Antarctica) penguin data collected and made available by + [Dr. Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) + and the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research + Network](https://lternet.edu/). 
+ - `path` penguins.json + - `schema` + +### `Species` + - `type` string +### `Island` + - `type` string +### `Beak Length (mm)` + - `type` number +### `Beak Depth (mm)` + - `type` number +### `Flipper Length (mm)` + - `type` integer +### `Body Mass (g)` + - `type` integer +### `Sex` + - `type` string +## `platformer-terrain.json` + - `description` Assets from the video game Celeste. + - `path` platformer-terrain.json + - `schema` + +### `x` + - `type` integer +### `y` + - `type` integer +### `lumosity` + - `type` number +### `saturation` + - `type` integer +### `name` + - `type` string +### `id` + - `type` string +### `color` + - `type` string +### `key` + - `type` string +## `points.json` + - `path` points.json + - `schema` + +### `x` + - `type` number +### `y` + - `type` number +## `political-contributions.json` + - `description` Summary financial information on contributions to candidates for U.S. + elections. An updated version of this dataset is available from the "all candidates" files + (in pipe-delimited format) on the bulk data download page of the U.S. Federal Election + Commission, or, alternatively, via OpenFEC. Information on each of the 25 columns is + available from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/). + The sample dataset in `political-contributions.json` contains 58 records with dates from 2015.
+ + FEC data is subject to the commission's: + - [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/) + - [Privacy and Security Policy](https://www.fec.gov/about/privacy-and-security-policy/) + - [Acceptable Use Policy](https://github.com/fecgov/FEC/blob/master/ACCEPTABLE-USE-POLICY.md) + + Additionally, the FEC's Github [repository](https://github.com/fecgov/FEC) states: + > This project is in the public domain within the United States, and we waive worldwide + > copyright and related rights through [CC0 universal public domain](https://creativecommons.org/publicdomain/zero/1.0/) + > dedication. Read more on our license page. + > A few restrictions limit the way you can use FEC data. For example, you can't use + > contributor lists for commercial purposes or to solicit donations. Learn more on + > [FEC.gov](https://www.fec.gov/). + - `path` political-contributions.json + - `schema` + +### `Candidate_Identification` + - `type` string +### `Candidate_Name` + - `type` string +### `Incumbent_Challenger_Status` + - `type` string +### `Party_Code` + - `type` integer +### `Party_Affiliation` + - `type` string +### `Total_Receipts` + - `type` number +### `Transfers_from_Authorized_Committees` + - `type` integer +### `Total_Disbursements` + - `type` number +### `Transfers_to_Authorized_Committees` + - `type` number +### `Beginning_Cash` + - `type` number +### `Ending_Cash` + - `type` number +### `Contributions_from_Candidate` + - `type` number +### `Loans_from_Candidate` + - `type` integer +### `Other_Loans` + - `type` integer +### `Candidate_Loan_Repayments` + - `type` number +### `Other_Loan_Repayments` + - `type` integer +### `Debts_Owed_By` + - `type` number +### `Total_Individual_Contributions` + - `type` integer +### `Candidate_State` + - `type` string +### `Candidate_District` + - `type` integer +### `Contributions_from_Other_Political_Committees` + - `type` integer +### `Contributions_from_Party_Committees` + - `type` integer +### 
`Coverage_End_Date` + - `type` string +### `Refunds_to_Individuals` + - `type` integer +### `Refunds_to_Committees` + - `type` integer +## `population.json` + - `description` United States population statistics by sex and age group across decades between 1850 and 2000. + The dataset was obtained from IPUMS USA, which "collects, preserves and harmonizes U.S. census + microdata" from as early as 1790. + + IPUMS updates and revises datasets over time, which may result in discrepancies between this + dataset and current IPUMS data. Details on data revisions are available here. + + When using this dataset, please refer to IPUMS USA terms of use. The organization requests the + use of the following citation for this json file: + Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated + Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. + http://doi.org/10.18128/D010.V6.0 + + - `path` population.json + - `schema` + +### `year` + - `description` Four-digit year of the survey + - `type` integer +### `age` + - `description` Age group in 5-year intervals (0=0-4, 5=5-9, 10=10-14, ..., 90=90+) + - `type` integer +### `sex` + - `description` Sex (1=men, 2=women) + - `type` integer +### `people` + - `description` Number of individuals (IPUMS PERWT) + - `type` integer +## `population_engineers_hurricanes.csv` + - `description` Per-state data on population, number of engineers, and hurricanes. Used in Vega-Lite example, + [Three Choropleths Representing Disjoint Data from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html) + - `path` population_engineers_hurricanes.csv + - `schema` + +### `state` + - `type` string +### `id` + - `type` integer +### `population` + - `type` integer +### `engineers` + - `type` number +### `hurricanes` + - `type` integer +## `seattle-weather-hourly-normals.csv` + - `description` Hourly weather normals with metric units.
The 1981-2010 Climate Normals are + NCDC's three-decade averages of climatological variables, including temperature and + precipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf). + We only included temperature, wind, and pressure + and updated the format to be easier to parse. + - `path` seattle-weather-hourly-normals.csv + - `schema` + +### `date` + - `type` datetime +### `pressure` + - `type` number +### `temperature` + - `type` number +### `wind` + - `type` number +## `seattle-weather.csv` + - `description` Daily weather records with metric units. Transformed using `/scripts/weather.py`. + The categorical "weather" field is synthesized from multiple fields in the original dataset. + This data is intended for instructional purposes. + - `path` seattle-weather.csv + - `schema` + +### `date` + - `type` date +### `precipitation` + - `type` number +### `temp_max` + - `type` number +### `temp_min` + - `type` number +### `wind` + - `type` number +### `weather` + - `type` string +## `sp500-2000.csv` + - `description` S&P 500 index values from 2000 to 2020. + - `path` sp500-2000.csv + - `schema` + +### `date` + - `type` date +### `open` + - `type` number +### `high` + - `type` number +### `low` + - `type` number +### `close` + - `type` number +### `adjclose` + - `type` number +### `volume` + - `type` integer +## `sp500.csv` + - `path` sp500.csv + - `schema` + +### `date` + - `type` string +### `price` + - `type` number +## `stocks.csv` + - `path` stocks.csv + - `schema` + +### `symbol` + - `type` string +### `date` + - `type` string +### `price` + - `type` number +## `udistrict.json` + - `path` udistrict.json + - `schema` + +### `key` + - `type` string +### `lat` + - `type` number +## `unemployment-across-industries.json` + - `description` Industry-level unemployment statistics from the Current Population Survey + (CPS), published monthly by the U.S. Bureau of Labor Statistics. 
Includes unemployed persons + and unemployment rate across 11 private industries, as well as agricultural, government, and + self-employed workers. Covers January 2000 through February 2010. Industry classification + follows format of CPS Table A-31. + + The dataset can be replicated using the BLS API. For more, see the `scripts` folder of this + repository. + + The BLS Web site states: + > "Users of the public API should cite the date that data were accessed or retrieved using + > the API. Users must clearly state that "BLS.gov cannot vouch for the data or analyses + > derived from these data after the data have been retrieved from BLS.gov." The BLS.gov logo + > may not be used by persons who are not BLS employees or on products (including web pages) + > that are not BLS-sponsored." + + See full BLS [terms of service](https://www.bls.gov/developers/termsOfService.htm). + - `path` unemployment-across-industries.json + - `schema` + +### `series` + - `description` Industry name + - `type` string +### `year` + - `description` Year (2000-2010) + - `type` integer +### `month` + - `description` Month (1-12) + - `type` integer +### `count` + - `description` Number of unemployed persons (in thousands) + - `type` integer +### `rate` + - `description` Unemployment rate (percentage) + - `type` number +### `date` + - `description` ISO 8601-formatted date string (e.g., "2000-01-01T08:00:00.000Z") + - `type` datetime +## `unemployment.tsv` + - `description` This dataset contains county-level unemployment rates in the United States, with data generally + consistent with levels reported in 2009. The dataset is structured as tab-separated values. + The unemployment rate represents the number of unemployed persons as a percentage of the labor + force. 
According to the Bureau of Labor Statistics (BLS) glossary: + + Unemployed persons (Current Population Survey) [are] persons aged 16 years and older who had + no employment during the reference week, were available for work, except for temporary + illness, and had made specific efforts to find employment sometime during the 4-week period + ending with the reference week. Persons who were waiting to be recalled to a job from which + they had been laid off need not have been looking for work to be classified as unemployed. + + This dataset is derived from the [Local Area Unemployment Statistics (LAUS)](https://www.bls.gov/lau/) program, + a federal-state cooperative effort overseen by the Bureau of Labor Statistics (BLS). + The LAUS program produces monthly and annual employment, unemployment, and labor force data for census regions and divisions, + states, counties, metropolitan areas, and many cities and towns. + + For the most up-to-date LAUS data: + 1. **Monthly and Annual Data Downloads**: + - Visit the [LAUS Data Tools](https://www.bls.gov/lau/data.htm) page for [monthly](https://www.bls.gov/lau/tables.htm#mcounty) + and [annual](https://www.bls.gov/lau/tables.htm#cntyaa) county data. + 2. **BLS Public Data API**: + - The BLS provides an API for developers to access various datasets, including LAUS data. + - To use the API for LAUS data, refer to the [LAUS Series ID Formats](https://www.bls.gov/help/hlpforma.htm#LA) to construct your query. + - API documentation and examples are available on the BLS Developers page. + + When using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm). 
+ - `path` unemployment.tsv + - `schema` + +### `id` + - `description` The combined state and county FIPS code + - `type` integer +### `rate` + - `description` The unemployment rate for the county + - `type` number +## `uniform-2d.json` + - `path` uniform-2d.json + - `schema` + +### `u` + - `type` number +### `v` + - `type` number +## `us-10m.json` + - `path` us-10m.json +## `us-employment.csv` + - `description` In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job + losses across the United States. The downturn in employment, and the slow recovery in hiring that + followed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau + of Labor Statistics. + + This file contains the monthly employment total in a variety of job categories from January 2006 + through December 2015. The numbers are seasonally adjusted and reported in thousands. The data + were downloaded on Nov. 11, 2018, and reformatted for use in this library. + + Totals are included for the [22 "supersectors"](https://download.bls.gov/pub/time.series/ce/ce.supersector) + tracked by the BLS. The "nonfarm" total is the category typically used by + economists and journalists as a stand-in for the country's employment total. + + A calculated "nonfarm_change" column has been appended with the month-to-month change in that + supersector's employment. It is useful for illustrating how to make bar charts that report both + negative and positive values. 
+ + - `path` us-employment.csv + - `schema` + +### `month` + - `type` date +### `nonfarm` + - `type` integer +### `private` + - `type` integer +### `goods_producing` + - `type` integer +### `service_providing` + - `type` integer +### `private_service_providing` + - `type` integer +### `mining_and_logging` + - `type` integer +### `construction` + - `type` integer +### `manufacturing` + - `type` integer +### `durable_goods` + - `type` integer +### `nondurable_goods` + - `type` integer +### `trade_transportation_utilties` + - `type` integer +### `wholesale_trade` + - `type` number +### `retail_trade` + - `type` number +### `transportation_and_warehousing` + - `type` number +### `utilities` + - `type` number +### `information` + - `type` integer +### `financial_activities` + - `type` integer +### `professional_and_business_services` + - `type` integer +### `education_and_health_services` + - `type` integer +### `leisure_and_hospitality` + - `type` integer +### `other_services` + - `type` integer +### `government` + - `type` integer +### `nonfarm_change` + - `type` integer +## `us-state-capitals.json` + - `path` us-state-capitals.json + - `schema` + +### `lon` + - `type` number +### `lat` + - `type` number +### `state` + - `type` string +### `city` + - `type` string +## `volcano.json` + - `description` Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. + This data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a + topographic map by Ross Ihaka, adapted from R datasets. These data should not be regarded as accurate. + - `path` volcano.json +## `weather.csv` + - `description` NOAA data transformed using `/scripts/weather.py`. Categorical "weather" field synthesized + from multiple fields in the original dataset. This data is intended for instructional purposes. 
+ - `path` weather.csv + - `schema` + +### `location` + - `type` string +### `date` + - `type` date +### `precipitation` + - `type` number +### `temp_max` + - `type` number +### `temp_min` + - `type` number +### `wind` + - `type` number +### `weather` + - `type` string +## `weather.json` + - `description` Instructional dataset showing actual and predicted temperature data. + - `path` weather.json +## `wheat.json` + - `description` In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair), + a Scottish engineer who is often credited as the founder of statistical graphics, + published an elegant chart on the price of wheat. It plots 250 years of prices alongside + weekly wages and the reigning monarch. He intended to demonstrate that "never at any former period + was wheat so cheap, in proportion to mechanical labour, as it is at the present time." + - `path` wheat.json + - `schema` + +### `year` + - `type` integer +### `wheat` + - `type` number +### `wages` + - `type` number +## `windvectors.csv` + - `description` Simulated wind patterns over northwestern Europe. 
+ - `path` windvectors.csv + - `schema` + +### `longitude` + - `type` number +### `latitude` + - `type` number +### `dir` + - `type` integer +### `dirCat` + - `type` integer +### `speed` + - `type` number +## `world-110m.json` + - `path` world-110m.json +## `zipcodes.csv` + - `description` GeoNames.org + - `path` zipcodes.csv + - `schema` + +### `zip_code` + - `type` integer +### `latitude` + - `type` number +### `longitude` + - `type` number +### `city` + - `type` string +### `state` + - `type` string +### `county` + - `type` string \ No newline at end of file diff --git a/scripts/build_datapackage.py b/scripts/build_datapackage.py index 92a5a5aa..f1aceba6 100755 --- a/scripts/build_datapackage.py +++ b/scripts/build_datapackage.py @@ -47,6 +47,7 @@ from typing import ( TYPE_CHECKING, Any, + Concatenate, LiteralString, NotRequired, Required, @@ -87,6 +88,7 @@ logger = logging.getLogger() type ResourceConstructor = Callable[..., Resource] +type PackageMethod[**P] = Callable[Concatenate[Package, P], Any] type PathMeta = Literal["name", "path", "scheme", "mediatype"] type PythonDataType = ( type[ @@ -107,8 +109,11 @@ | None ) +type OutputFormat = Literal["json", "yaml", "md", "md-tabular"] + ADDITIONS_TOML: LiteralString = "datapackage_additions.toml" NPM_PACKAGE: Literal["package.json"] = "package.json" +DATAPACKAGE: Literal["datapackage"] = "datapackage" POLARS_PY_TO_FL_FIELD: Mapping[PythonDataType, type[fl.Field]] = { int: IntegerField, @@ -451,10 +456,25 @@ def read_json(fp: Path, /) -> Any: return json.load(f) +def write_package(pkg: Package, repo_dir: Path, *formats: OutputFormat) -> None: + """Write the final datapackage in one or more formats.""" + configs: dict[OutputFormat, tuple[str, PackageMethod[str]]] = { + "json": (".json", partial(Package.to_json)), + "yaml": (".yaml", partial(Package.to_yaml)), + "md": (".md", partial(Package.to_markdown)), + "md-tabular": ("-tabular.md", partial(Package.to_markdown, table=True)), + } + for fmt in formats: + 
postfix, fn = configs[fmt] + p = (repo_dir / f"{DATAPACKAGE}{postfix}").as_posix() + msg = f"Writing {p!r}" + logger.info(msg) + fn(pkg, p) + + def main( *, - stem: str = "datapackage", - output_format: Literal["json", "yaml", "both"] = "json", + output_format: Literal["json", "yaml"] = "json", ) -> None: if output_format not in {"json", "yaml", "both"}: msg = f"Expected one of {['json', 'yaml', 'both']!r} but got {output_format!r}" @@ -476,15 +496,8 @@ def main( pkg = Package(resources=list(iter_resources(data_dir, overrides)), **pkg_meta) # type: ignore[arg-type] msg = f"Collected {len(pkg.resources)} resources" logger.info(msg) - if output_format in {"json", "both"}: - p = (repo_dir / f"{stem}.json").as_posix() - logger.info(msg) - pkg.to_json(p) - if output_format in {"yaml", "both"}: - p = (repo_dir / f"{stem}.yaml").as_posix() - msg = f"Writing {p!r}" - logger.info(msg) - pkg.to_yaml(p) + DEBUG_MARKDOWN = "md", "md-tabular" + write_package(pkg, repo_dir, output_format, *DEBUG_MARKDOWN) if __name__ == "__main__": From 79a30cc2073fe5469fb899f7f264bcee180322cb Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 13 Dec 2024 13:43:36 +0000 Subject: [PATCH 22/40] refactor: remove `"both"` option from `output_format` --- scripts/build_datapackage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/build_datapackage.py b/scripts/build_datapackage.py index f1aceba6..5860d7cc 100755 --- a/scripts/build_datapackage.py +++ b/scripts/build_datapackage.py @@ -476,8 +476,8 @@ def main( *, output_format: Literal["json", "yaml"] = "json", ) -> None: - if output_format not in {"json", "yaml", "both"}: - msg = f"Expected one of {['json', 'yaml', 'both']!r} but got {output_format!r}" + if output_format not in {"json", "yaml"}: + msg = f"Expected one of {['json', 'yaml']!r} but got {output_format!r}" raise TypeError(msg) repo_dir: Path = Path(__file__).parent.parent data_dir: Path = repo_dir / 
"data" From 80dd7d5df951862a17ce8cef8d598b61be5e6a07 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:18:29 +0000 Subject: [PATCH 23/40] refactor(typing): Always return `Resource` in `ResourceAdapter.from_path` --- scripts/build_datapackage.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/scripts/build_datapackage.py b/scripts/build_datapackage.py index 5860d7cc..340024e9 100755 --- a/scripts/build_datapackage.py +++ b/scripts/build_datapackage.py @@ -156,19 +156,30 @@ class ResourceAdapter: """https://www.iana.org/assignments/media-types/application/vnd.apache.arrow.file""" @classmethod - def from_path(cls, source: Path, /) -> Resource | None: - suffix = source.suffix - match suffix: + def is_supported(cls, source: Path, /) -> bool: + return source.suffix in { + ".csv", + ".tsv", + ".json", + ".parquet", + ".png", + ".jpg", + ".arrow", + } + + @classmethod + def from_path(cls, source: Path, /) -> Resource: + match source.suffix: case ".csv" | ".tsv" | ".parquet": return cls.from_tabular_safe(source) case ".json": return cls.from_json(source) - case ".png": + case ".png" | ".jpg": return cls.from_image(source) case ".arrow": return cls.from_arrow(source) case _: - return None + raise TypeError(source.suffix) @classmethod def infer_as(cls, source: Path, tp: ResourceConstructor, /) -> Resource: @@ -436,15 +447,15 @@ def iter_resources( Additional metadata, with a higher precedence than inferred. 
""" for fp in iter_data_dir(root): - if resource := ResourceAdapter.from_path(fp): - name = fp.name - if name in overrides: - resource = ResourceAdapter.with_extras(resource, **overrides[name]) - yield resource - else: + if not ResourceAdapter.is_supported(fp): msg = f"Skipping unexpected extension {fp.suffix!r}\n\n{fp!r}" warnings.warn(msg, stacklevel=2) continue + resource = ResourceAdapter.from_path(fp) + name = fp.name + if name in overrides: + resource = ResourceAdapter.with_extras(resource, **overrides[name]) + yield resource def read_toml(fp: Path, /) -> dict[str, Any]: From 51fb7fca853413b7b287c50d1b8f27b766e27a76 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 13 Dec 2024 17:12:06 +0000 Subject: [PATCH 24/40] refactor: Remove non-tabular markdown option The fix I have for this issue requires maintaining our own templates (https://github.com/vega/vega-datasets/pull/643#discussion_r1884039627) Not looking to maintain two sets of templates --- datapackage.md | 1386 ---------------------------------- scripts/build_datapackage.py | 7 +- 2 files changed, 3 insertions(+), 1390 deletions(-) delete mode 100644 datapackage.md diff --git a/datapackage.md b/datapackage.md deleted file mode 100644 index 7cb8f531..00000000 --- a/datapackage.md +++ /dev/null @@ -1,1386 +0,0 @@ -# `vega-datasets`- `description` Common repository for example datasets used by Vega related projects. -- `homepage` http://github.com/vega/vega-datasets.git -- `licenses` - - [1] - - `name` BSD-3-Clause - - `path` https://opensource.org/license/bsd-3-clause - - `title` The 3-Clause BSD License -- `contributors` - - [1] - - `title` UW Interactive Data Lab - - `path` http://idl.cs.washington.edu -- `version` 2.11.0 -- `created` 2024-12-13T12:53:03.887410+00:00 -## `7zip.png` - - `description` Application icons from open-source software projects. 
- - `path` 7zip.png -## `airports.csv` - - `path` airports.csv - - `schema` - -### `iata` - - `type` string -### `name` - - `type` string -### `city` - - `type` string -### `state` - - `type` string -### `country` - - `type` string -### `latitude` - - `type` number -### `longitude` - - `type` number -## `annual-precip.json` - - `description` A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell. - - `path` annual-precip.json -## `anscombe.json` - - `description` Graphs in Statistical Analysis, F. J. Anscombe, The American Statistician. - - `path` anscombe.json - - `schema` - -### `Series` - - `type` string -### `X` - - `type` integer -### `Y` - - `type` number -## `barley.json` - - `description` The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites. - - It was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper "Statistical Determination of Barley Varietal Adaption". - - R.A. Fisher's popularized its use in the field of statistics when he included it in his book "The Design of Experiments". - - Since then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s. - - - `path` barley.json - - `schema` - -### `yield` - - `type` number -### `variety` - - `type` string -### `year` - - `type` integer -### `site` - - `type` string -## `birdstrikes.csv` - - `description` Records of reported wildlife strikes received by the U.S. 
FAA - - `path` birdstrikes.csv - - `schema` - -### `Airport Name` - - `type` string -### `Aircraft Make Model` - - `type` string -### `Effect Amount of damage` - - `type` string -### `Flight Date` - - `type` date -### `Aircraft Airline Operator` - - `type` string -### `Origin State` - - `type` string -### `Phase of flight` - - `type` string -### `Wildlife Size` - - `type` string -### `Wildlife Species` - - `type` string -### `Time of day` - - `type` string -### `Cost Other` - - `type` integer -### `Cost Repair` - - `type` integer -### `Cost Total $` - - `type` integer -### `Speed IAS in knots` - - `type` integer -## `budget.json` - - `description` Historical and forecasted federal revenue/receipts produced in 2016 by the U.S. Office of Management and Budget. - - `path` budget.json - - `schema` - -### `Source Category Code` - - `type` integer -### `Source category name` - - `type` string -### `Source subcategory` - - `type` integer -### `Source subcategory name` - - `type` string -### `Agency code` - - `type` integer -### `Agency name` - - `type` string -### `Bureau code` - - `type` integer -### `Bureau name` - - `type` string -### `Account code` - - `type` integer -### `Account name` - - `type` string -### `Treasury Agency code` - - `type` integer -### `On- or off-budget` - - `type` string -### `1962` - - `type` string -### `1963` - - `type` string -### `1964` - - `type` string -### `1965` - - `type` string -### `1966` - - `type` string -### `1967` - - `type` string -### `1968` - - `type` string -### `1969` - - `type` string -### `1970` - - `type` string -### `1971` - - `type` string -### `1972` - - `type` string -### `1973` - - `type` string -### `1974` - - `type` string -### `1975` - - `type` string -### `1976` - - `type` string -### `TQ` - - `type` string -### `1977` - - `type` string -### `1978` - - `type` string -### `1979` - - `type` string -### `1980` - - `type` string -### `1981` - - `type` string -### `1982` - - `type` string -### `1983` - - `type` string 
-### `1984` - - `type` string -### `1985` - - `type` string -### `1986` - - `type` string -### `1987` - - `type` string -### `1988` - - `type` string -### `1989` - - `type` string -### `1990` - - `type` string -### `1991` - - `type` string -### `1992` - - `type` string -### `1993` - - `type` string -### `1994` - - `type` string -### `1995` - - `type` string -### `1996` - - `type` string -### `1997` - - `type` string -### `1998` - - `type` string -### `1999` - - `type` string -### `2000` - - `type` string -### `2001` - - `type` string -### `2002` - - `type` string -### `2003` - - `type` string -### `2004` - - `type` string -### `2005` - - `type` string -### `2006` - - `type` string -### `2007` - - `type` string -### `2008` - - `type` string -### `2009` - - `type` string -### `2010` - - `type` string -### `2011` - - `type` string -### `2012` - - `type` string -### `2013` - - `type` string -### `2014` - - `type` string -### `2015` - - `type` string -### `2016` - - `type` string -### `2017` - - `type` string -### `2018` - - `type` string -### `2019` - - `type` string -### `2020` - - `type` string -## `budgets.json` - - `path` budgets.json - - `schema` - -### `budgetYear` - - `type` integer -### `forecastYear` - - `type` integer -### `value` - - `type` number -## `burtin.json` - - `description` The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine. - - The dataset compares the performance of three antibiotics against 16 different bacteria. - - Numerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness. - - The dataset was featured as an example in the Protovis project, a precursor to D3.js. 
- - As noted in the Protovis example, "Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 μg/ml, and an exaggeration of some values for penicillin". - - The vega-datasets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together. - - The caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) - reads as follows: - - > ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin - > - > - > The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in > red and gram- in blue) with their sensitivities to penicillin, and streptomycin. - > - > The effectiveness of the antibiotics is expressed as the highest dilution in μ/ml. which inhibits > the test organism. - > - > High dilutions are toward the periphery; consequently the length of the colored bar is proportional > to the effectiveness. - > - > It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. > fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. > vulgaris, S. schottmuelleri and M. tuberculosis. - > - > Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to > neomycin, although the majority of these are sensitive to neomycin. - > - > It also inhibits actinomycetes, but is inactive against viruses and fungi. Its mode of action is > not understood.
- - `path` burtin.json - - `schema` - -### `Bacteria` - - `type` string -### `Penicillin` - - `type` number -### `Streptomycin` - - `type` number -### `Neomycin` - - `type` number -### `Gram_Staining` - - `type` string -### `Genus` - - `type` string -## `cars.json` - - `description` Collection of car specifications and performance metrics from various automobile manufacturers. - - `path` cars.json - - `schema` - -### `Name` - - `type` string -### `Miles_per_Gallon` - - `type` integer -### `Cylinders` - - `type` integer -### `Displacement` - - `type` number -### `Horsepower` - - `type` integer -### `Weight_in_lbs` - - `type` integer -### `Acceleration` - - `type` number -### `Year` - - `type` date -### `Origin` - - `type` string -## `co2-concentration.csv` - - `description` Scripps CO2 program data but modified to only include date, CO2, seasonally adjusted CO2. - Only includes rows with valid data. - - `path` co2-concentration.csv - - `schema` - -### `Date` - - `type` date -### `CO2` - - `type` number -### `adjusted CO2` - - `type` number -## `countries.json` - - `description` This dataset combines key demographic indicators (life expectancy at birth and - fertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year - intervals. It includes both current values and adjacent time period values (previous and next) - for each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) - notes that its philosophy is to fill data gaps with estimates and use current - geographic boundaries for historical data. Gapminder states that it aims to "show people the - big picture" rather than support detailed numeric analysis.
- - `path` countries.json - - `schema` - -### `_comment` - - `type` string -### `year` - - `description` Years from 1955 to 2000 at 5-year intervals - - `type` integer -### `fertility` - - `description` Fertility rate (average number of children per woman) for the given year - - `type` number -### `life_expect` - - `description` Life expectancy in years for the given year - - `type` number -### `n_fertility` - - `description` Fertility rate for the next 5-year interval - - `type` number -### `n_life_expect` - - `description` Life expectancy for the next 5-year interval - - `type` number -### `country` - - `description` Name of the country - - `type` string -## `crimea.json` - - `path` crimea.json - - `schema` - -### `date` - - `type` date -### `wounds` - - `type` integer -### `other` - - `type` integer -### `disease` - - `type` integer -## `disasters.csv` - - `description` Annual number of deaths from disasters. - - `path` disasters.csv - - `schema` - -### `Entity` - - `type` string -### `Year` - - `type` integer -### `Deaths` - - `type` integer -## `driving.json` - - `path` driving.json - - `schema` - -### `side` - - `type` string -### `year` - - `type` integer -### `miles` - - `type` integer -### `gas` - - `type` number -## `earthquakes.json` - - `description` Earthquake data retrieved Feb 6, 2018 - - `path` earthquakes.json -## `ffox.png` - - `description` Application icons from open-source software projects. - - `path` ffox.png -## `flare-dependencies.json` - - `path` flare-dependencies.json - - `schema` - -### `source` - - `type` integer -### `target` - - `type` integer -## `flare.json` - - `path` flare.json - - `schema` - -### `id` - - `type` integer -### `name` - - `type` string -## `flights-10k.json` - - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. 
Transformed using `/scripts/flights.py` - - `path` flights-10k.json - - `schema` - -### `date` - - `type` string -### `delay` - - `type` integer -### `distance` - - `type` integer -### `origin` - - `type` string -### `destination` - - `type` string -## `flights-200k.arrow` - - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` - - `path` flights-200k.arrow - - `schema` - -### `delay` - - `type` integer -### `distance` - - `type` integer -### `time` - - `type` number -## `flights-200k.json` - - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` - - `path` flights-200k.json - - `schema` - -### `delay` - - `type` integer -### `distance` - - `type` integer -### `time` - - `type` number -## `flights-20k.json` - - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` - - `path` flights-20k.json - - `schema` - -### `date` - - `type` string -### `delay` - - `type` integer -### `distance` - - `type` integer -### `origin` - - `type` string -### `destination` - - `type` string -## `flights-2k.json` - - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` - - `path` flights-2k.json - - `schema` - -### `date` - - `type` string -### `delay` - - `type` integer -### `distance` - - `type` integer -### `origin` - - `type` string -### `destination` - - `type` string -## `flights-3m.parquet` - - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` - - `path` flights-3m.parquet - - `schema` - -### `date` - - `type` datetime -### `delay` - - `type` integer -### `distance` - - `type` integer -### `origin` - - `type` string -### `destination` - - `type` string -## `flights-5k.json` - - `description` Flight delay statistics from U.S. 
Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` - - `path` flights-5k.json - - `schema` - -### `date` - - `type` string -### `delay` - - `type` integer -### `distance` - - `type` integer -### `origin` - - `type` string -### `destination` - - `type` string -## `flights-airport.csv` - - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` - - `path` flights-airport.csv - - `schema` - -### `origin` - - `type` string -### `destination` - - `type` string -### `count` - - `type` integer -## `football.json` - - `description` Football match outcomes across multiple divisions from 2013 to 2017, part of a - larger dataset from OpenFootball. The subset was made such that there are records for all five - chosen divisions over the time period. - - `path` football.json - - `schema` - -### `date` - - `type` date -### `division` - - `type` string -### `home_team` - - `type` string -### `away_team` - - `type` string -### `home_score` - - `type` integer -### `away_score` - - `type` integer -## `gapminder-health-income.csv` - - `description` Per-capita income, life expectancy, population and regional grouping. Dataset does not specify - the reference year for the data. Gapminder historical data is subject to revisions. - - Gapminder (v30, 2023) defines per-capita income as follows: - >"This is real GDP per capita (gross domestic product per person adjusted for inflation) - >converted to international dollars using purchasing power parity rates. An international dollar - >has the same purchasing power over GDP as the U.S. dollar has in the United States." 
- - - `path` gapminder-health-income.csv - - `schema` - -### `country` - - `type` string -### `income` - - `type` integer -### `health` - - `type` number -### `population` - - `type` integer -### `region` - - `type` string -## `gapminder.json` - - `description` This dataset combines key demographic indicators (life expectancy at birth, - population, and fertility rate measured as babies per woman) for various countries from 1955 - to 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable - grouping countries. Gapminder's data documentation notes that its philosophy is to fill data - gaps with estimates and use current geographic boundaries for historical data. Gapminder - states that it aims to "show people the big picture" rather than support detailed numeric - analysis. - - Notes: - 1. Country Selection: The set of countries in this file matches the version of this dataset - originally added to this collection in 2015. The specific criteria for country selection - in that version are not known. Data for Aruba are no longer available in the new version. - Hong Kong has been revised to Hong Kong, China in the new version. - - 2. Data Precision: The precision of float values may have changed from the original version. - These changes reflect the most recent source data used for each indicator. - - 3. Regional Groupings: The 'cluster' column represents a regional mapping of countries - corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. To - preserve continuity with previous versions of this dataset, we have retained the column - name 'cluster' instead of renaming it to 'six_regions'. The six regions represented are: - `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`. 
- - `path` gapminder.json - - `schema` - -### `year` - - `description` Years from 1955 to 2005 at 5-year intervals - - `type` integer -### `country` - - `description` Name of the country - - `type` string -### `cluster` - - `description` A categorical variable (values 0-5) grouping countries by region - - `type` integer -### `pop` - - `description` Population of the country - - `type` integer -### `life_expect` - - `description` Life expectancy in years - - `type` number -### `fertility` - - `description` Fertility rate (average number of children per woman) - - `type` number -## `gimp.png` - - `description` Application icons from open-source software projects. - - `path` gimp.png -## `github.csv` - - `description` Generated using `/scripts/github.py`. - - `path` github.csv - - `schema` - -### `time` - - `type` string -### `count` - - `type` integer -## `global-temp.csv` - - `description` Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, L-OTI), 1880-2023. - - `path` global-temp.csv - - `schema` - -### `year` - - `type` integer -### `temp` - - `type` number -## `income.json` - - `path` income.json - - `schema` - -### `name` - - `type` string -### `region` - - `type` string -### `id` - - `type` integer -### `pct` - - `type` number -### `total` - - `type` integer -### `group` - - `type` string -## `iowa-electricity.csv` - - `description` The state of Iowa has dramatically increased its production of renewable - wind power in recent years. This file contains the annual net generation of electricity in - the state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. - It is useful for illustrating stacked area charts. - - `path` iowa-electricity.csv - - `schema` - -### `year` - - `type` date -### `source` - - `type` string -### `net_generation` - - `type` integer -## `jobs.json` - - `description` U.S.
census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which "collects, preserves and harmonizes U.S. census microdata" from as early as 1790. - - Originally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Viégas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). - The dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/). - - Data is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of IPUMS USA, according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions). - - IPUMS USA confirmed in 2024 correspondence that hosting this dataset on vega-datasets is permissible, stating: - >We're excited to hear that this dataset made its way to this repository and is being used by students for data visualization. We allow for these types of redistributions of summary data so long as the underlying microdata records are not shared. - - This dataset contains only summary statistics and does not include any underlying microdata records. - - 1. This dataset represents summary data. The underlying microdata records are not included. - 2. Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section) - (person weight) variable as an expansion factor when working with IPUMS USA extracts. - 3. 
Due to coding revisions, figures for earlier years (particularly 19th century) may not match current IPUMS USA data exactly. - - When using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml). - The organization requests use of the following citation for this json file: - - Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0 - - - `path` jobs.json - - `schema` - -### `job` - - `description` The occupation title - - `type` string -### `sex` - - `description` Sex (men/women) - - `type` string -### `year` - - `description` Census year - - `type` integer -### `count` - - `description` Number of individuals in the occupation - - `type` integer -### `perc` - - `description` Percentage of the workforce in the occupation - - `type` number -## `la-riots.csv` - - `description` More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles - for five days starting on April 29, 1992. This file contains metadata about each person, including the geographic - coordinates of their death. Compiled and published by the Los Angeles Times Data Desk. - - `path` la-riots.csv - - `schema` - -### `first_name` - - `type` string -### `last_name` - - `type` string -### `age` - - `type` integer -### `gender` - - `type` string -### `race` - - `type` string -### `death_date` - - `type` date -### `address` - - `type` string -### `neighborhood` - - `type` string -### `type` - - `type` string -### `longitude` - - `type` number -### `latitude` - - `type` number -## `londonboroughs.json` - - `description` Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. 
- Original data "contains National Statistics data © Crown copyright and database right (2015)" - and "Contains Ordnance Survey data © Crown copyright and database right [2015]". - - `path` londonBoroughs.json -## `londoncentroids.json` - - `description` Calculated from `londonBoroughs.json` using `d3.geoCentroid`. - - `path` londonCentroids.json - - `schema` - -### `name` - - `type` string -### `cx` - - `type` number -### `cy` - - `type` number -## `londontubelines.json` - - `description` Selected rail lines simplified from source. - - `path` londonTubeLines.json -## `lookup_groups.csv` - - `path` lookup_groups.csv - - `schema` - -### `group` - - `type` integer -### `person` - - `type` string -## `lookup_people.csv` - - `path` lookup_people.csv - - `schema` - -### `name` - - `type` string -### `age` - - `type` integer -### `height` - - `type` integer -## `miserables.json` - - `path` miserables.json -## `monarchs.json` - - `description` A chronological list of English and British monarchs from Elizabeth I through George IV. - Each entry includes: - - The dataset contains two intentional inaccuracies to maintain compatibility with - the [Wheat and Wages](https://vega.github.io/vega/examples/wheat-and-wages/) example visualization: - 1. the start date for the reign of Elizabeth I is shown as 1565, instead of 1558; - 2. the end date for the reign of George IV is shown as 1820, instead of 1830. - These discrepancies align the `monarchs.json` dataset with the start and end dates of the `wheat.json` dataset used in the visualization. - The entry "W&M" represents the joint reign of William III and Mary II. While the dataset shows their reign as 1689-1702, - the official Web site of the British royal family indicates that Mary II's reign ended in 1694, though William III continued to rule until 1702.
- The `commonwealth` field is used to flag the period from 1649 to 1660, which includes the Commonwealth of England, the Protectorate, - and the period leading to the Restoration. While historically more accurate to call this the "interregnum," the field name of `commonwealth` - from the original dataset is retained for backwards compatibility. - The dataset was revised in Aug. 2024. James II's reign now ends in 1688 (previously 1689). - Source data has been verified against the kings & queens and interregnum pages of the official website of the British royal family (retrieved in Aug. 2024). - Content on the site is protected by Crown Copyright. - Under the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most - Crown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/). - - `path` monarchs.json - - `schema` - -### `name` - - `description` The ruler's name or identifier (e.g., "W&M" for William and Mary, "Cromwell" for the period of interregnum) - - `type` string -### `start` - - `description` The year their rule began - - `type` integer -### `end` - - `description` The year their rule ended - - `type` integer -### `index` - - `description` A zero-based sequential number assigned to each entry, representing the chronological order of rulers - - `type` integer -## `movies.json` - - `description` The dataset has well known and intentionally included errors. - This dataset is provided for instructional purposes, including the need to reckon with dirty data. 
- - `path` movies.json - - `schema` - -### `Title` - - `type` string -### `US Gross` - - `type` integer -### `Worldwide Gross` - - `type` integer -### `US DVD Sales` - - `type` integer -### `Production Budget` - - `type` integer -### `Release Date` - - `type` string -### `MPAA Rating` - - `type` string -### `Running Time min` - - `type` integer -### `Distributor` - - `type` string -### `Source` - - `type` string -### `Major Genre` - - `type` string -### `Creative Type` - - `type` string -### `Director` - - `type` string -### `Rotten Tomatoes Rating` - - `type` integer -### `IMDB Rating` - - `type` number -### `IMDB Votes` - - `type` integer -## `normal-2d.json` - - `path` normal-2d.json - - `schema` - -### `u` - - `type` number -### `v` - - `type` number -## `obesity.json` - - `path` obesity.json - - `schema` - -### `id` - - `type` integer -### `rate` - - `type` number -### `state` - - `type` string -## `ohlc.json` - - `description` This dataset contains the performance of the Chicago Board Options Exchange - [Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/ - %5EVIX#overview)) in the summer of 2009. - - `path` ohlc.json - - `schema` - -### `date` - - `type` date -### `open` - - `type` number -### `high` - - `type` number -### `low` - - `type` number -### `close` - - `type` number -### `signal` - - `type` string -### `ret` - - `type` number -## `penguins.json` - - `description` Palmer Archipelago (Antarctica) penguin data collected and made available by - [Dr. Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) - and the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research - Network](https://lternet.edu/). 
- - `path` penguins.json - - `schema` - -### `Species` - - `type` string -### `Island` - - `type` string -### `Beak Length (mm)` - - `type` number -### `Beak Depth (mm)` - - `type` number -### `Flipper Length (mm)` - - `type` integer -### `Body Mass (g)` - - `type` integer -### `Sex` - - `type` string -## `platformer-terrain.json` - - `description` Assets from the video game Celeste. - - `path` platformer-terrain.json - - `schema` - -### `x` - - `type` integer -### `y` - - `type` integer -### `lumosity` - - `type` number -### `saturation` - - `type` integer -### `name` - - `type` string -### `id` - - `type` string -### `color` - - `type` string -### `key` - - `type` string -## `points.json` - - `path` points.json - - `schema` - -### `x` - - `type` number -### `y` - - `type` number -## `political-contributions.json` - - `description` Summary financial information on contributions to candidates for U.S. - elections. An updated version of this dataset is available from the "all candidates" files - (in pipe-delimited format) on the bulk data download page of the U.S. Federal Election - Commission, or, alternatively, via OpenFEC. Information on each of the 25 columns is - available from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/). - The sample dataset in `political-contributions.json` contains 58 records with dates from 2015.
- - FEC data is subject to the commission's: - - [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/) - - [Privacy and Security Policy](https://www.fec.gov/about/privacy-and-security-policy/) - - [Acceptable Use Policy](https://github.com/fecgov/FEC/blob/master/ACCEPTABLE-USE-POLICY.md) - - Additionally, the FEC's Github [repository](https://github.com/fecgov/FEC) states: - > This project is in the public domain within the United States, and we waive worldwide - > copyright and related rights through [CC0 universal public domain](https://creativecommons.org/publicdomain/zero/1.0/) - > dedication. Read more on our license page. - > A few restrictions limit the way you can use FEC data. For example, you can't use - > contributor lists for commercial purposes or to solicit donations. Learn more on - > [FEC.gov](https://www.fec.gov/). - - `path` political-contributions.json - - `schema` - -### `Candidate_Identification` - - `type` string -### `Candidate_Name` - - `type` string -### `Incumbent_Challenger_Status` - - `type` string -### `Party_Code` - - `type` integer -### `Party_Affiliation` - - `type` string -### `Total_Receipts` - - `type` number -### `Transfers_from_Authorized_Committees` - - `type` integer -### `Total_Disbursements` - - `type` number -### `Transfers_to_Authorized_Committees` - - `type` number -### `Beginning_Cash` - - `type` number -### `Ending_Cash` - - `type` number -### `Contributions_from_Candidate` - - `type` number -### `Loans_from_Candidate` - - `type` integer -### `Other_Loans` - - `type` integer -### `Candidate_Loan_Repayments` - - `type` number -### `Other_Loan_Repayments` - - `type` integer -### `Debts_Owed_By` - - `type` number -### `Total_Individual_Contributions` - - `type` integer -### `Candidate_State` - - `type` string -### `Candidate_District` - - `type` integer -### `Contributions_from_Other_Political_Committees` - - `type` integer -### `Contributions_from_Party_Committees` - - `type` integer -### 
`Coverage_End_Date` - - `type` string -### `Refunds_to_Individuals` - - `type` integer -### `Refunds_to_Committees` - - `type` integer -## `population.json` - - `description` United States population statistics by sex and age group across decades between 1850 and 2000. - The dataset was obtained from IPUMS USA, which "collects, preserves and harmonizes U.S. census - microdata" from as early as 1790. - - IPUMS updates and revises datasets over time, which may result in discrepancies between this - dataset and current IPUMS data. Details on data revisions are available here. - - When using this dataset, please refer to IPUMS USA terms of use. The organization requests the - use of the following citation for this json file: - Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated - Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. - http://doi.org/10.18128/D010.V6.0 - - - `path` population.json - - `schema` - -### `year` - - `description` Four-digit year of the survey - - `type` integer -### `age` - - `description` Age group in 5-year intervals (0=0-4, 5=5-9, 10=10-14, ..., 90=90+) - - `type` integer -### `sex` - - `description` Sex (1=men, 2=women) - - `type` integer -### `people` - - `description` Number of individuals (IPUMS PERWT) - - `type` integer -## `population_engineers_hurricanes.csv` - - `description` Per-state data on population, number of engineers, and hurricanes. Used in Vega-Lite example, - [Three Choropleths Representing Disjoint Data from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html) - - `path` population_engineers_hurricanes.csv - - `schema` - -### `state` - - `type` string -### `id` - - `type` integer -### `population` - - `type` integer -### `engineers` - - `type` number -### `hurricanes` - - `type` integer -## `seattle-weather-hourly-normals.csv` - - `description` Hourly weather normals with metric units.
The 1981-2010 Climate Normals are - NCDC's three-decade averages of climatological variables, including temperature and - precipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf). - We only included temperature, wind, and pressure - and updated the format to be easier to parse. - - `path` seattle-weather-hourly-normals.csv - - `schema` - -### `date` - - `type` datetime -### `pressure` - - `type` number -### `temperature` - - `type` number -### `wind` - - `type` number -## `seattle-weather.csv` - - `description` Daily weather records with metric units. Transformed using `/scripts/weather.py`. - The categorical "weather" field is synthesized from multiple fields in the original dataset. - This data is intended for instructional purposes. - - `path` seattle-weather.csv - - `schema` - -### `date` - - `type` date -### `precipitation` - - `type` number -### `temp_max` - - `type` number -### `temp_min` - - `type` number -### `wind` - - `type` number -### `weather` - - `type` string -## `sp500-2000.csv` - - `description` S&P 500 index values from 2000 to 2020. - - `path` sp500-2000.csv - - `schema` - -### `date` - - `type` date -### `open` - - `type` number -### `high` - - `type` number -### `low` - - `type` number -### `close` - - `type` number -### `adjclose` - - `type` number -### `volume` - - `type` integer -## `sp500.csv` - - `path` sp500.csv - - `schema` - -### `date` - - `type` string -### `price` - - `type` number -## `stocks.csv` - - `path` stocks.csv - - `schema` - -### `symbol` - - `type` string -### `date` - - `type` string -### `price` - - `type` number -## `udistrict.json` - - `path` udistrict.json - - `schema` - -### `key` - - `type` string -### `lat` - - `type` number -## `unemployment-across-industries.json` - - `description` Industry-level unemployment statistics from the Current Population Survey - (CPS), published monthly by the U.S. Bureau of Labor Statistics. 
Includes unemployed persons - and unemployment rate across 11 private industries, as well as agricultural, government, and - self-employed workers. Covers January 2000 through February 2010. Industry classification - follows format of CPS Table A-31. - - The dataset can be replicated using the BLS API. For more, see the `scripts` folder of this - repository. - - The BLS Web site states: - > "Users of the public API should cite the date that data were accessed or retrieved using - > the API. Users must clearly state that "BLS.gov cannot vouch for the data or analyses - > derived from these data after the data have been retrieved from BLS.gov." The BLS.gov logo - > may not be used by persons who are not BLS employees or on products (including web pages) - > that are not BLS-sponsored." - - See full BLS [terms of service](https://www.bls.gov/developers/termsOfService.htm). - - `path` unemployment-across-industries.json - - `schema` - -### `series` - - `description` Industry name - - `type` string -### `year` - - `description` Year (2000-2010) - - `type` integer -### `month` - - `description` Month (1-12) - - `type` integer -### `count` - - `description` Number of unemployed persons (in thousands) - - `type` integer -### `rate` - - `description` Unemployment rate (percentage) - - `type` number -### `date` - - `description` ISO 8601-formatted date string (e.g., "2000-01-01T08:00:00.000Z") - - `type` datetime -## `unemployment.tsv` - - `description` This dataset contains county-level unemployment rates in the United States, with data generally - consistent with levels reported in 2009. The dataset is structured as tab-separated values. - The unemployment rate represents the number of unemployed persons as a percentage of the labor - force. 
According to the Bureau of Labor Statistics (BLS) glossary: - - Unemployed persons (Current Population Survey) [are] persons aged 16 years and older who had - no employment during the reference week, were available for work, except for temporary - illness, and had made specific efforts to find employment sometime during the 4-week period - ending with the reference week. Persons who were waiting to be recalled to a job from which - they had been laid off need not have been looking for work to be classified as unemployed. - - This dataset is derived from the [Local Area Unemployment Statistics (LAUS)](https://www.bls.gov/lau/) program, - a federal-state cooperative effort overseen by the Bureau of Labor Statistics (BLS). - The LAUS program produces monthly and annual employment, unemployment, and labor force data for census regions and divisions, - states, counties, metropolitan areas, and many cities and towns. - - For the most up-to-date LAUS data: - 1. **Monthly and Annual Data Downloads**: - - Visit the [LAUS Data Tools](https://www.bls.gov/lau/data.htm) page for [monthly](https://www.bls.gov/lau/tables.htm#mcounty) - and [annual](https://www.bls.gov/lau/tables.htm#cntyaa) county data. - 2. **BLS Public Data API**: - - The BLS provides an API for developers to access various datasets, including LAUS data. - - To use the API for LAUS data, refer to the [LAUS Series ID Formats](https://www.bls.gov/help/hlpforma.htm#LA) to construct your query. - - API documentation and examples are available on the BLS Developers page. - - When using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm). 
- - `path` unemployment.tsv - - `schema` - -### `id` - - `description` The combined state and county FIPS code - - `type` integer -### `rate` - - `description` The unemployment rate for the county - - `type` number -## `uniform-2d.json` - - `path` uniform-2d.json - - `schema` - -### `u` - - `type` number -### `v` - - `type` number -## `us-10m.json` - - `path` us-10m.json -## `us-employment.csv` - - `description` In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job - losses across the United States. The downturn in employment, and the slow recovery in hiring that - followed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau - of Labor Statistics. - - This file contains the monthly employment total in a variety of job categories from January 2006 - through December 2015. The numbers are seasonally adjusted and reported in thousands. The data - were downloaded on Nov. 11, 2018, and reformatted for use in this library. - - Totals are included for the [22 "supersectors"](https://download.bls.gov/pub/time.series/ce/ce.supersector) - tracked by the BLS. The "nonfarm" total is the category typically used by - economists and journalists as a stand-in for the country's employment total. - - A calculated "nonfarm_change" column has been appended with the month-to-month change in that - supersector's employment. It is useful for illustrating how to make bar charts that report both - negative and positive values. 
- - - `path` us-employment.csv - - `schema` - -### `month` - - `type` date -### `nonfarm` - - `type` integer -### `private` - - `type` integer -### `goods_producing` - - `type` integer -### `service_providing` - - `type` integer -### `private_service_providing` - - `type` integer -### `mining_and_logging` - - `type` integer -### `construction` - - `type` integer -### `manufacturing` - - `type` integer -### `durable_goods` - - `type` integer -### `nondurable_goods` - - `type` integer -### `trade_transportation_utilties` - - `type` integer -### `wholesale_trade` - - `type` number -### `retail_trade` - - `type` number -### `transportation_and_warehousing` - - `type` number -### `utilities` - - `type` number -### `information` - - `type` integer -### `financial_activities` - - `type` integer -### `professional_and_business_services` - - `type` integer -### `education_and_health_services` - - `type` integer -### `leisure_and_hospitality` - - `type` integer -### `other_services` - - `type` integer -### `government` - - `type` integer -### `nonfarm_change` - - `type` integer -## `us-state-capitals.json` - - `path` us-state-capitals.json - - `schema` - -### `lon` - - `type` number -### `lat` - - `type` number -### `state` - - `type` string -### `city` - - `type` string -## `volcano.json` - - `description` Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. - This data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a - topographic map by Ross Ihaka, adapted from R datasets. These data should not be regarded as accurate. - - `path` volcano.json -## `weather.csv` - - `description` NOAA data transformed using `/scripts/weather.py`. Categorical "weather" field synthesized - from multiple fields in the original dataset. This data is intended for instructional purposes. 
- - `path` weather.csv - - `schema` - -### `location` - - `type` string -### `date` - - `type` date -### `precipitation` - - `type` number -### `temp_max` - - `type` number -### `temp_min` - - `type` number -### `wind` - - `type` number -### `weather` - - `type` string -## `weather.json` - - `description` Instructional dataset showing actual and predicted temperature data. - - `path` weather.json -## `wheat.json` - - `description` In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair), - a Scottish engineer who is often credited as the founder of statistical graphics, - published an elegant chart on the price of wheat. It plots 250 years of prices alongside - weekly wages and the reigning monarch. He intended to demonstrate that "never at any former period - was wheat so cheap, in proportion to mechanical labour, as it is at the present time." - - `path` wheat.json - - `schema` - -### `year` - - `type` integer -### `wheat` - - `type` number -### `wages` - - `type` number -## `windvectors.csv` - - `description` Simulated wind patterns over northwestern Europe. 
- - `path` windvectors.csv - - `schema` - -### `longitude` - - `type` number -### `latitude` - - `type` number -### `dir` - - `type` integer -### `dirCat` - - `type` integer -### `speed` - - `type` number -## `world-110m.json` - - `path` world-110m.json -## `zipcodes.csv` - - `description` GeoNames.org - - `path` zipcodes.csv - - `schema` - -### `zip_code` - - `type` integer -### `latitude` - - `type` number -### `longitude` - - `type` number -### `city` - - `type` string -### `state` - - `type` string -### `county` - - `type` string \ No newline at end of file diff --git a/scripts/build_datapackage.py b/scripts/build_datapackage.py index 340024e9..ec04f358 100755 --- a/scripts/build_datapackage.py +++ b/scripts/build_datapackage.py @@ -109,7 +109,7 @@ | None ) -type OutputFormat = Literal["json", "yaml", "md", "md-tabular"] +type OutputFormat = Literal["json", "yaml", "md"] ADDITIONS_TOML: LiteralString = "datapackage_additions.toml" NPM_PACKAGE: Literal["package.json"] = "package.json" @@ -472,8 +472,7 @@ def write_package(pkg: Package, repo_dir: Path, *formats: OutputFormat) -> None: configs: dict[OutputFormat, tuple[str, PackageMethod[str]]] = { "json": (".json", partial(Package.to_json)), "yaml": (".yaml", partial(Package.to_yaml)), - "md": (".md", partial(Package.to_markdown)), - "md-tabular": ("-tabular.md", partial(Package.to_markdown, table=True)), + "md": ("-tabular.md", partial(Package.to_markdown, table=True)), } for fmt in formats: postfix, fn = configs[fmt] @@ -507,7 +506,7 @@ def main( pkg = Package(resources=list(iter_resources(data_dir, overrides)), **pkg_meta) # type: ignore[arg-type] msg = f"Collected {len(pkg.resources)} resources" logger.info(msg) - DEBUG_MARKDOWN = "md", "md-tabular" + DEBUG_MARKDOWN = ("md",) write_package(pkg, repo_dir, output_format, *DEBUG_MARKDOWN) From ed5c5c801be02435708eb4698067a50d8d5ad034 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 13 Dec 2024 18:27:56 
+0000 Subject: [PATCH 25/40] feat: Adds `render_markdown_patch` Resolves https://github.com/vega/vega-datasets/pull/643#discussion_r1884039627 --- scripts/build_datapackage.py | 40 ++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/scripts/build_datapackage.py b/scripts/build_datapackage.py index ec04f358..c223da07 100755 --- a/scripts/build_datapackage.py +++ b/scripts/build_datapackage.py @@ -71,6 +71,7 @@ StringField, TimeField, ) +from frictionless.formats.markdown import mapper as fl_markdown from frictionless.resources import ( JsonResource, MapResource, @@ -149,6 +150,45 @@ ) +def render_markdown_patch(path: str, data: dict[str, Any]) -> str: + """ + Patch to `frictionless.formats.markdown.mapper.render_markdown`_ to support template overrides. + + Declare a template with the same name as a default to override it: + + # Override directory: + vega-datasets/_data/templates/ + + # Default directory: + frictionless/assets/templates/ + + ``path`` is the template name handed to ``get_template``; ``data`` is unpacked into the keyword variables the template is rendered with. + + .. _frictionless.formats.markdown.mapper.render_markdown: + https://github.com/frictionlessdata/frictionless-py/blob/6b72909ee38403df7c0245f408f3881bfa56ad6f/frictionless/formats/markdown/mapper.py#L13-L43 + + Original doc + ------------ + Render any JSON-like object as Markdown, using jinja2 template. 
+ """ + import jinja2 # noqa: PLC0415 + + # Create environ: override_dir is listed before default_dir, so a same-named template there shadows the frictionless default + default_dir: Path = Path(fl_markdown.__file__).parent / "../../assets/templates" + override_dir: Path = Path(__file__).parent.parent / "_data" / "templates" + searchpath = override_dir, default_dir + loader = jinja2.FileSystemLoader(searchpath) + environ = jinja2.Environment(loader=loader, lstrip_blocks=True, trim_blocks=True) + + # Register the helper filters taken from the frictionless markdown mapper, then load and render the requested template + environ.filters["filter_dict"] = fl_markdown.filter_dict + environ.filters["dict_to_markdown"] = fl_markdown.dict_to_markdown + environ.filters["tabulate"] = fl_markdown.dicts_to_markdown_table + template = environ.get_template(path) + return template.render(**data) + + +fl_markdown.render_markdown = render_markdown_patch + + class ResourceAdapter: mediatype: ClassVar[Mapping[str, str]] = { ".arrow": "application/vnd.apache.arrow.file" From 20c44f20cac4fac5f8a638e105735a76bfab9bfb Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 13 Dec 2024 18:29:41 +0000 Subject: [PATCH 26/40] feat: Adds template overrides for `Package`, `Resource` These can be extended if anyone has any ideas --- _data/templates/package-table.md | 15 +++++++++++++++ _data/templates/resource-table.md | 23 +++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 _data/templates/package-table.md create mode 100644 _data/templates/resource-table.md diff --git a/_data/templates/package-table.md b/_data/templates/package-table.md new file mode 100644 index 00000000..0daee636 --- /dev/null +++ b/_data/templates/package-table.md @@ -0,0 +1,15 @@ +# {{ package.name }} +`{{ package.version }}` | ([GitHub]({{ package.homepage }})) | ({{ package.created }}) + +{{ package.description }} + +## licenses +{{ package.licenses | tabulate() }} + +## 
contributors +{{ package.contributors | tabulate() }} + +# resources +{% for resource in package.resources %} + {% include 'resource-table.md' %} +{% endfor %} \ No newline at end of file diff 
--git a/_data/templates/resource-table.md b/_data/templates/resource-table.md new file mode 100644 index 00000000..9290060c --- /dev/null +++ b/_data/templates/resource-table.md @@ -0,0 +1,23 @@ +## `{{ resource.name }}`{% if resource.title %} {{ resource.title }}{% endif %} + +{% if resource.path %} +### path +{{ resource.path }} +{% endif %} +{% if resource.description %} +### description +{{ resource.description | indent(4, False) }} +{% endif %} +{% if resource.schema %} +### schema +{{ resource.schema | filter_dict(exclude=['fields']) | dict_to_markdown(level=2) }} +{{ resource.schema.fields | tabulate() }} +{% endif %} +{% if resource.sources %} +### sources +{{ resource.sources | tabulate() }} +{% endif %} +{% if resource.licenses %} +### licenses +{{ resource.licenses | tabulate() }} +{% endif %} From ea7cbd8ca59fc855872d04093343bd0898c99921 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 13 Dec 2024 18:30:34 +0000 Subject: [PATCH 27/40] build: generate markdown w/ new templates https://github.com/vega/vega-datasets/pull/643#discussion_r1884149924 --- datapackage-tabular.md | 983 +++++++++++++++++++++++++++-------------- datapackage.json | 2 +- 2 files changed, 661 insertions(+), 324 deletions(-) diff --git a/datapackage-tabular.md b/datapackage-tabular.md index 15b8901e..f6787e06 100644 --- a/datapackage-tabular.md +++ b/datapackage-tabular.md @@ -1,24 +1,30 @@ -# `vega-datasets`- `description` Common repository for example datasets used by Vega related projects. 
-- `homepage` http://github.com/vega/vega-datasets.git -- `licenses` - - [1] - - `name` BSD-3-Clause - - `path` https://opensource.org/license/bsd-3-clause - - `title` The 3-Clause BSD License -- `contributors` - - [1] - - `title` UW Interactive Data Lab - - `path` http://idl.cs.washington.edu -- `version` 2.11.0 -- `created` 2024-12-13T12:53:03.887410+00:00 +# vega-datasets +`2.11.0` | ([GitHub](http://github.com/vega/vega-datasets.git)) | (2024-12-13T18:29:45.637443+00:00) + +Common repository for example datasets used by Vega related projects. + +## licenses +| name | path | title | +|:-------------|:--------------------------------------------|:-------------------------| +| BSD-3-Clause | https://opensource.org/license/bsd-3-clause | The 3-Clause BSD License | + +## contributors +| title | path | +|:------------------------|:-----------------------------| +| UW Interactive Data Lab | http://idl.cs.washington.edu | + +# resources ## `7zip.png` - - `description` Application icons from open-source software projects. - - `path` 7zip.png +### path +7zip.png +### description +Application icons from open-source software projects. ## `airports.csv` - - `path` airports.csv - - `schema` - - | name | type | +### path +airports.csv +### schema + +| name | type | |:----------|:-------| | iata | string | | name | string | @@ -28,20 +34,31 @@ | latitude | number | | longitude | number | ## `annual-precip.json` - - `description` A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell. - - `path` annual-precip.json +### path +annual-precip.json +### description +A raster grid of global annual precipitation for the year 2016 at a resolution 1 degree of lon/lat per cell. 
+### sources +| title | path | +|:----------------------------------|:-------------------------------------------------------------------------------------------------------| +| Climate Forecast System Version 2 | https://www.ncdc.noaa.gov/data-access/model-data/model-datasets/climate-forecast-system-version2-cfsv2 | ## `anscombe.json` - - `description` Graphs in Statistical Analysis, F. J. Anscombe, The American Statistician. - - `path` anscombe.json - - `schema` - - | name | type | +### path +anscombe.json +### description +Graphs in Statistical Analysis, F. J. Anscombe, The American Statistician. +### schema + +| name | type | |:-------|:--------| | Series | string | | X | integer | | Y | number | ## `barley.json` - - `description` The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites. +### path +barley.json +### description +The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites. It was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper "Statistical Determination of Barley Varietal Adaption". @@ -49,21 +66,27 @@ Since then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s. 
- - `path` barley.json - - `schema` - - | name | type | +### schema + +| name | type | |:--------|:--------| | yield | number | | variety | string | | year | integer | | site | string | +### sources +| title | path | +|:------------------------------------|:---------------------------------------------------------------------| +| The Design of Experiments Reference | https://en.wikipedia.org/wiki/The_Design_of_Experiments | +| Trellis Charts Paper | http://ml.stat.purdue.edu/stat695t/writings/TrellisDesignControl.pdf | ## `birdstrikes.csv` - - `description` Records of reported wildlife strikes received by the U.S. FAA - - `path` birdstrikes.csv - - `schema` - - | name | type | +### path +birdstrikes.csv +### description +Records of reported wildlife strikes received by the U.S. FAA +### schema + +| name | type | |:--------------------------|:--------| | Airport Name | string | | Aircraft Make Model | string | @@ -79,12 +102,18 @@ | Cost Repair | integer | | Cost Total $ | integer | | Speed IAS in knots | integer | +### sources +| title | path | +|:-----------------------------|:------------------------| +| FAA Wildlife Strike Database | http://wildlife.faa.gov | ## `budget.json` - - `description` Historical and forecasted federal revenue/receipts produced in 2016 by the U.S. Office of Management and Budget. - - `path` budget.json - - `schema` - - | name | type | +### path +budget.json +### description +Historical and forecasted federal revenue/receipts produced in 2016 by the U.S. Office of Management and Budget. 
+### schema + +| name | type | |:------------------------|:--------| | Source Category Code | integer | | Source category name | string | @@ -158,17 +187,25 @@ | 2018 | string | | 2019 | string | | 2020 | string | +### sources +| title | path | +|:------------------------------------------------------------|:--------------------------------------------------------------------| +| Office of Management and Budget - Budget FY 2016 - Receipts | https://www.govinfo.gov/app/details/BUDGET-2016-DB/BUDGET-2016-DB-3 | ## `budgets.json` - - `path` budgets.json - - `schema` - - | name | type | +### path +budgets.json +### schema + +| name | type | |:-------------|:--------| | budgetYear | integer | | forecastYear | integer | | value | number | ## `burtin.json` - - `description` The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine. +### path +burtin.json +### description +The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine. The dataset compares the performance of three antibiotics against 16 different bacteria. @@ -198,10 +235,9 @@ > > It also inhibits actinomycetes, but is inactive against viruses and fungi. Its mode of action is > not understood. 
- - `path` burtin.json - - `schema` - - | name | type | +### schema + +| name | type | |:--------------|:-------| | Bacteria | string | | Penicillin | number | @@ -209,12 +245,19 @@ | Neomycin | number | | Gram_Staining | string | | Genus | string | +### sources +| title | path | +|:-----------------------------|:---------------------------------------------------------------------| +| Scope Magazine | https://graphicdesignarchives.org/projects/scope-magazine-vol-iii-5/ | +| Protovis Antibiotics Example | https://mbostock.github.io/protovis/ex/antibiotics-burtin.html | ## `cars.json` - - `description` Collection of car specifications and performance metrics from various automobile manufacturers. - - `path` cars.json - - `schema` - - | name | type | +### path +cars.json +### description +Collection of car specifications and performance metrics from various automobile manufacturers. +### schema + +| name | type | |:-----------------|:--------| | Name | string | | Miles_per_Gallon | integer | @@ -225,29 +268,41 @@ | Acceleration | number | | Year | date | | Origin | string | +### sources +| title | path | +|:-------------------------|:----------------------------------| +| StatLib Datasets Archive | http://lib.stat.cmu.edu/datasets/ | ## `co2-concentration.csv` - - `description` Scripps CO2 program data ut modified to only include date, CO2, seasonally adjusted CO2. +### path +co2-concentration.csv +### description +Scripps CO2 program data ut modified to only include date, CO2, seasonally adjusted CO2. Only includes rows with valid data. 
- - `path` co2-concentration.csv - - `schema` - - | name | type | +### schema + +| name | type | |:-------------|:-------| | Date | date | | CO2 | number | | adjusted CO2 | number | +### sources +| title | path | +|:--------------------|:------------------------------------------------------------------------| +| Scripps CO2 Program | https://scrippsco2.ucsd.edu/data/atmospheric_co2/primary_mlo_co2_record | ## `countries.json` - - `description` This dataset combines key demographic indicators (life expectancy at birth and +### path +countries.json +### description +This dataset combines key demographic indicators (life expectancy at birth and fertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year intervals. It includes both current values and adjacent time period values (previous and next) for each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) notes that its philosophy is to fill data gaps with estimates and use current geographic boundaries for historical data. Gapminder states that it aims to "show people the big picture" rather than support detailed numeric analysis. 
- - `path` countries.json - - `schema` - - | name | type | description | +### schema + +| name | type | description | |:--------------|:--------|:-------------------------------------------------------------------------| | _comment | string | | | year | integer | Years from 1955 to 2000 at 5-year intervals | @@ -256,156 +311,237 @@ | n_fertility | number | Fertility rate for the next 5-year interval | | n_life_expect | number | Life expectancy for the next 5-year interval | | country | string | Name of the country | +### sources +| title | path | version | +|:---------------------------------------|:---------------------------------------------------------------------------------------------------------------------|----------:| +| Gapminder Foundation - Life Expectancy | https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676 | 14 | +| Gapminder Foundation - Fertility | https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676 | 14 | +### licenses +| title | path | +|:-----------------------------------------------|:-----------------------------------------| +| Creative Commons Attribution 4.0 International | https://www.gapminder.org/free-material/ | ## `crimea.json` - - `path` crimea.json - - `schema` - - | name | type | +### path +crimea.json +### schema + +| name | type | |:--------|:--------| | date | date | | wounds | integer | | other | integer | | disease | integer | ## `disasters.csv` - - `description` Annual number of deaths from disasters. - - `path` disasters.csv - - `schema` - - | name | type | +### path +disasters.csv +### description +Annual number of deaths from disasters. 
+### schema + +| name | type | |:-------|:--------| | Entity | string | | Year | integer | | Deaths | integer | +### sources +| title | path | +|:-----------------------------------------|:------------------------------------------------| +| Our World in Data - Natural Catastrophes | https://ourworldindata.org/natural-catastrophes | ## `driving.json` - - `path` driving.json - - `schema` - - | name | type | +### path +driving.json +### schema + +| name | type | |:-------|:--------| | side | string | | year | integer | | miles | integer | | gas | number | +### sources +| title | path | +|:---------------|:------------------------------------------------------------------------------------------| +| New York Times | https://archive.nytimes.com/www.nytimes.com/imagepages/2010/05/02/business/02metrics.html | ## `earthquakes.json` - - `description` Earthquake data retrieved Feb 6, 2018 - - `path` earthquakes.json +### path +earthquakes.json +### description +Earthquake data retrieved Feb 6, 2018 +### sources +| title | path | +|:---------------------|:---------------------------------------------------------------------------| +| USGS Earthquake Feed | https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson | ## `ffox.png` - - `description` Application icons from open-source software projects. - - `path` ffox.png +### path +ffox.png +### description +Application icons from open-source software projects. ## `flare-dependencies.json` - - `path` flare-dependencies.json - - `schema` - - | name | type | +### path +flare-dependencies.json +### schema + +| name | type | |:-------|:--------| | source | integer | | target | integer | ## `flare.json` - - `path` flare.json - - `schema` - - | name | type | +### path +flare.json +### schema + +| name | type | |:-------|:--------| | id | integer | | name | string | ## `flights-10k.json` - - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. 
Transformed using `/scripts/flights.py` - - `path` flights-10k.json - - `schema` - - | name | type | +### path +flights-10k.json +### description +Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` +### schema + +| name | type | |:------------|:--------| | date | string | | delay | integer | | distance | integer | | origin | string | | destination | string | +### sources +| title | path | +|:-----------------------------------------|:-------------------------------------------------------------------------------------| +| U.S. Bureau of Transportation Statistics | https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr | ## `flights-200k.arrow` - - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` - - `path` flights-200k.arrow - - `schema` - - | name | type | +### path +flights-200k.arrow +### description +Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` +### schema + +| name | type | |:---------|:--------| | delay | integer | | distance | integer | | time | number | +### sources +| title | path | +|:-----------------------------------------|:-------------------------------------------------------------------------------------| +| U.S. Bureau of Transportation Statistics | https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr | ## `flights-200k.json` - - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` - - `path` flights-200k.json - - `schema` - - | name | type | +### path +flights-200k.json +### description +Flight delay statistics from U.S. Bureau of Transportation Statistics. 
Transformed using `/scripts/flights.py` +### schema + +| name | type | |:---------|:--------| | delay | integer | | distance | integer | | time | number | +### sources +| title | path | +|:-----------------------------------------|:-------------------------------------------------------------------------------------| +| U.S. Bureau of Transportation Statistics | https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr | ## `flights-20k.json` - - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` - - `path` flights-20k.json - - `schema` - - | name | type | +### path +flights-20k.json +### description +Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` +### schema + +| name | type | |:------------|:--------| | date | string | | delay | integer | | distance | integer | | origin | string | | destination | string | +### sources +| title | path | +|:-----------------------------------------|:-------------------------------------------------------------------------------------| +| U.S. Bureau of Transportation Statistics | https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr | ## `flights-2k.json` - - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` - - `path` flights-2k.json - - `schema` - - | name | type | +### path +flights-2k.json +### description +Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` +### schema + +| name | type | |:------------|:--------| | date | string | | delay | integer | | distance | integer | | origin | string | | destination | string | +### sources +| title | path | +|:-----------------------------------------|:-------------------------------------------------------------------------------------| +| U.S. 
Bureau of Transportation Statistics | https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr | ## `flights-3m.parquet` - - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` - - `path` flights-3m.parquet - - `schema` - - | name | type | +### path +flights-3m.parquet +### description +Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` +### schema + +| name | type | |:------------|:---------| | date | datetime | | delay | integer | | distance | integer | | origin | string | | destination | string | +### sources +| title | path | +|:-----------------------------------------|:-------------------------------------------------------------------------------------| +| U.S. Bureau of Transportation Statistics | https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr | ## `flights-5k.json` - - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` - - `path` flights-5k.json - - `schema` - - | name | type | +### path +flights-5k.json +### description +Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` +### schema + +| name | type | |:------------|:--------| | date | string | | delay | integer | | distance | integer | | origin | string | | destination | string | +### sources +| title | path | +|:-----------------------------------------|:-------------------------------------------------------------------------------------| +| U.S. Bureau of Transportation Statistics | https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr | ## `flights-airport.csv` - - `description` Flight delay statistics from U.S. Bureau of Transportation Statistics. 
Transformed using `/scripts/flights.py` - - `path` flights-airport.csv - - `schema` - - | name | type | +### path +flights-airport.csv +### description +Flight delay statistics from U.S. Bureau of Transportation Statistics. Transformed using `/scripts/flights.py` +### schema + +| name | type | |:------------|:--------| | origin | string | | destination | string | | count | integer | +### sources +| title | path | +|:-----------------------------------------|:-------------------------------------------------------------------------------------| +| U.S. Bureau of Transportation Statistics | https://www.transtats.bts.gov/DL_SelectFields.asp?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr | ## `football.json` - - `description` Football match outcomes across multiple divisions from 2013 to 2017, part of a +### path +football.json +### description +Football match outcomes across multiple divisions from 2013 to 2017, part of a larger dataset from OpenFootball. The subset was made such that there are records for all five chosen divisions over the time period. - - `path` football.json - - `schema` - - | name | type | +### schema + +| name | type | |:-----------|:--------| | date | date | | division | string | @@ -413,8 +549,15 @@ | away_team | string | | home_score | integer | | away_score | integer | +### sources +| title | path | +|:-------------|:----------------------------------------------| +| OpenFootball | https://github.com/openfootball/football.json | ## `gapminder-health-income.csv` - - `description` Per-capita income, life expectancy, population and regional grouping. Dataset does not specify +### path +gapminder-health-income.csv +### description +Per-capita income, life expectancy, population and regional grouping. Dataset does not specify the reference year for the data. Gapminder historical data is subject to revisions. 
Gapminder (v30, 2023) defines per-capita income as follows: @@ -422,18 +565,29 @@ >converted to international dollars using purchasing power parity rates. An international dollar >has the same purchasing power over GDP as the U.S. dollar has in the United States." - - `path` gapminder-health-income.csv - - `schema` - - | name | type | +### schema + +| name | type | |:-----------|:--------| | country | string | | income | integer | | health | number | | population | integer | | region | string | +### sources +| title | path | +|:------------------------------|:---------------------------------------------------------------------------------------------------------------------| +| Gapminder Foundation | https://www.gapminder.org | +| Gapminder GDP Per Capita Data | https://docs.google.com/spreadsheets/d/1i5AEui3WZNZqh7MQ4AKkJuCz4rRxGR_pw_9gtbcBOqQ/edit?gid=501532268#gid=501532268 | +### licenses +| title | path | +|:-----------------------------------------------|:-----------------------------------------| +| Creative Commons Attribution 4.0 International | https://www.gapminder.org/free-material/ | ## `gapminder.json` - - `description` This dataset combines key demographic indicators (life expectancy at birth, +### path +gapminder.json +### description +This dataset combines key demographic indicators (life expectancy at birth, population, and fertility rate measured as babies per woman) for various countries from 1955 to 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable grouping countries. Gapminder's data documentation notes that its philosophy is to fill data @@ -455,10 +609,9 @@ preserve continuity with previous versions of this dataset, we have retained the column name 'cluster' instead of renaming it to 'six_regions'. The six regions represented are: `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`. 
- - `path` gapminder.json - - `schema` - - | name | type | description | +### schema + +| name | type | description | |:------------|:--------|:-----------------------------------------------------------------| | year | integer | Years from 1955 to 2005 at 5-year intervals | | country | string | Name of the country | @@ -466,32 +619,55 @@ | pop | integer | Population of the country | | life_expect | number | Life expectancy in years | | fertility | number | Fertility rate (average number of children per woman | +### sources +| title | path | version | +|:---------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------|----------:| +| Gapminder Foundation - Life Expectancy (Data) | https://docs.google.com/spreadsheets/d/1RehxZjXd7_rG8v2pJYV6aY0J3LAsgUPDQnbY4dRdiSs/edit?gid=176703676#gid=176703676 | 14 | +| Gapminder Foundatio - Life Expectancy (Documentation) | https://www.gapminder.org/data/documentation/gd004/ | | +| Gapminder Foundation - Population (Data) | https://docs.google.com/spreadsheets/d/1c1luQNdpH90tNbMIeU7jD__59wQ0bdIGRFpbMm8ZBTk/edit?gid=176703676#gid=176703676 | 7 | +| Gapminder Foundation - Population (Documentation) | https://www.gapminder.org/data/documentation/gd003/ | | +| Gapminder Foundation - Fertility (Data) | https://docs.google.com/spreadsheets/d/1aLtIpAWvDGGa9k2XXEz6hZugWn0wCd5nmzaRPPjbYNA/edit?gid=176703676#gid=176703676 | 14 | +| Gapminder Foundation - Fertility Documentation (Documentation) | https://www.gapminder.org/data/documentation/gd008/ | | +| Gapminder Foundation - Data Geographies (Data) | https://docs.google.com/spreadsheets/d/1qHalit8sXC0R8oVXibc2wa2gY7bkwGzOybEMTWp-08o/edit?gid=1597424158#gid=1597424158 | 2 | +| Gapminder Foundation - Data Geographies (Documentation) | https://www.gapminder.org/data/geo/ | | +| Gapminder Data Documentation | https://www.gapminder.org/data/documentation/ | | ## `gimp.png` - 
- `description` Application icons from open-source software projects. - - `path` gimp.png +### path +gimp.png +### description +Application icons from open-source software projects. ## `github.csv` - - `description` Generated using `/scripts/github.py`. - - `path` github.csv - - `schema` - - | name | type | +### path +github.csv +### description +Generated using `/scripts/github.py`. +### schema + +| name | type | |:-------|:--------| | time | string | | count | integer | ## `global-temp.csv` - - `description` Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, L-OTI), 1880-2023. - - `path` global-temp.csv - - `schema` - - | name | type | +### path +global-temp.csv +### description +Combined Land-Surface Air and Sea-Surface Water Temperature Anomalies (Land-Ocean Temperature Index, L-OTI), 1880-2023. +### schema + +| name | type | |:-------|:--------| | year | integer | | temp | number | +### sources +| title | path | +|:-----------------------------------------|:------------------------------------| +| NASA Goddard Institute for Space Studies | https://data.giss.nasa.gov/gistemp/ | ## `income.json` - - `path` income.json - - `schema` - - | name | type | +### path +income.json +### schema + +| name | type | |:-------|:--------| | name | string | | region | string | @@ -500,20 +676,29 @@ | total | integer | | group | string | ## `iowa-electricity.csv` - - `description` The state of Iowa has dramatically increased its production of renewable +### path +iowa-electricity.csv +### description +The state of Iowa has dramatically increased its production of renewable wind power in recent years. This file contains the annual net generation of electricity in the state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. It is useful for illustrating stacked area charts. 
- - `path` iowa-electricity.csv - - `schema` - - | name | type | +### schema + +| name | type | |:---------------|:--------| | year | date | | source | string | | net_generation | integer | +### sources +| title | path | +|:---------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| U.S. Energy Information Administration | https://www.eia.gov/beta/electricity/data/browser/#/topic/0?agg=2,0,1&fuel=vvg&geo=00000g&sec=g&linechart=ELEC.GEN.OTH-IA-99.A~ELEC.GEN.COW-IA-99.A~ELEC.GEN.PEL-IA-99.A~ELEC.GEN.PC-IA-99.A~ELEC.GEN.NG-IA-99.A~~ELEC.GEN.NUC-IA-99.A~ELEC.GEN.HYC-IA-99.A~ELEC.GEN.AOR-IA-99.A~ELEC.GEN.HPS-IA-99.A~&columnchart=ELEC.GEN.ALL-IA-99.A&map=ELEC.GEN.ALL-IA-99.A&freq=A&start=2001&end=2017&ctype=linechart<ype=pin&tab=overview&maptype=0&rse=0&pin= | ## `jobs.json` - - `description` U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which "collects, preserves and harmonizes U.S. census microdata" from as early as 1790. +### path +jobs.json +### description +U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which "collects, preserves and harmonizes U.S. census microdata" from as early as 1790. 
Originally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Viégas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). The dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/). @@ -535,24 +720,29 @@ Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0 - - `path` jobs.json - - `schema` - - | name | type | description | +### schema + +| name | type | description | |:-------|:--------|:----------------------------------------------| | job | string | The occupation title | | sex | string | Sex (men/women) | | year | integer | Census year | | count | integer | Number of individuals in the occupation | | perc | number | Percentage of the workforce in the occupation | +### sources +| title | path | version | +|:----------|:---------------------------|----------:| +| IPUMS USA | https://usa.ipums.org/usa/ | 6 | ## `la-riots.csv` - - `description` More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles +### path +la-riots.csv +### description +More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles for five days starting on April 29, 1992. This file contains metadata about each person, including the geographic coordinates of their death. Compiled and published by the Los Angeles Times Data Desk. 
- - `path` la-riots.csv - - `schema` - - | name | type | +### schema + +| name | type | |:-------------|:--------| | first_name | string | | last_name | string | @@ -565,45 +755,69 @@ | type | string | | longitude | number | | latitude | number | +### sources +| title | path | +|:---------------------------------------------|:-------------------------------------------------| +| LA Riots Deaths, Los Angeles Times Data Desk | http://spreadsheets.latimes.com/la-riots-deaths/ | ## `londonboroughs.json` - - `description` Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. +### path +londonBoroughs.json +### description +Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. Original data "contains National Statistics data © Crown copyright and database right (2015)" and "Contains Ordnance Survey data © Crown copyright and database right [2015]. - - `path` londonBoroughs.json +### sources +| title | path | +|:-------------------------------------------------|:-------------------------------------------------------------------------| +| Statistical GIS Boundary Files, London Datastore | https://data.london.gov.uk/dataset/statistical-gis-boundary-files-london | ## `londoncentroids.json` - - `description` Calculated from `londongBoroughs.json` using `d3.geoCentroid`. - - `path` londonCentroids.json - - `schema` - - | name | type | +### path +londonCentroids.json +### description +Calculated from `londongBoroughs.json` using `d3.geoCentroid`. +### schema + +| name | type | |:-------|:-------| | name | string | | cx | number | | cy | number | ## `londontubelines.json` - - `description` Selected rail lines simplified from source. - - `path` londonTubeLines.json +### path +londonTubeLines.json +### description +Selected rail lines simplified from source. 
+### sources +| title | path | +|:-----------------|:-----------------------------------------------------| +| London Tube Data | https://github.com/oobrien/vis/tree/master/tube/data | ## `lookup_groups.csv` - - `path` lookup_groups.csv - - `schema` - - | name | type | +### path +lookup_groups.csv +### schema + +| name | type | |:-------|:--------| | group | integer | | person | string | ## `lookup_people.csv` - - `path` lookup_people.csv - - `schema` - - | name | type | +### path +lookup_people.csv +### schema + +| name | type | |:-------|:--------| | name | string | | age | integer | | height | integer | ## `miserables.json` - - `path` miserables.json +### path +miserables.json ## `monarchs.json` - - `description` A chronological list of English and British monarchs from Elizabeth I through George IV. +### path +monarchs.json +### description +A chronological list of English and British monarchs from Elizabeth I through George IV. Each entry includes: The dataset contains two intentional inaccuracies to maintain compatibility with @@ -621,22 +835,28 @@ Content on the site is protected by Crown Copyright. Under the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most Crown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/). 
- - `path` monarchs.json - - `schema` - - | name | type | description | +### schema + +| name | type | description | |:-------|:--------|:------------------------------------------------------------------------------------------------------------| | name | string | The ruler's name or identifier (e.g., "W&M" for William and Mary, "Cromwell" for the period of interregnum) | | start | integer | The year their rule began | | end | integer | The year their rule ended | | index | integer | A zero-based sequential number assigned to each entry, representing the chronological order of rulers | +### sources +| title | path | +|:----------------------------------|:-------------------------------------------| +| The Royal Family - Kings & Queens | https://www.royal.uk/kings-and-queens-1066 | +| The Royal Family - Interregnum | https://www.royal.uk/interregnum-1649-1660 | ## `movies.json` - - `description` The dataset has well known and intentionally included errors. +### path +movies.json +### description +The dataset has well known and intentionally included errors. This dataset is provided for instructional purposes, including the need to reckon with dirty data. 
- - `path` movies.json - - `schema` - - | name | type | +### schema + +| name | type | |:-----------------------|:--------| | Title | string | | US Gross | integer | @@ -655,30 +875,34 @@ | IMDB Rating | number | | IMDB Votes | integer | ## `normal-2d.json` - - `path` normal-2d.json - - `schema` - - | name | type | +### path +normal-2d.json +### schema + +| name | type | |:-------|:-------| | u | number | | v | number | ## `obesity.json` - - `path` obesity.json - - `schema` - - | name | type | +### path +obesity.json +### schema + +| name | type | |:-------|:--------| | id | integer | | rate | number | | state | string | ## `ohlc.json` - - `description` This dataset contains the performance of the Chicago Board Options Exchange +### path +ohlc.json +### description +This dataset contains the performance of the Chicago Board Options Exchange [Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/ %5EVIX#overview)) in the summer of 2009. - - `path` ohlc.json - - `schema` - - | name | type | +### schema + +| name | type | |:-------|:-------| | date | date | | open | number | @@ -687,15 +911,21 @@ | close | number | | signal | string | | ret | number | +### sources +| title | path | +|:-----------------------|:---------------------------------------| +| Yahoo Finance VIX Data | https://finance.yahoo.com/chart/%5EVIX | ## `penguins.json` - - `description` Palmer Archipelago (Antarctica) penguin data collected and made available by +### path +penguins.json +### description +Palmer Archipelago (Antarctica) penguin data collected and made available by [Dr. Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) and the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research Network](https://lternet.edu/). 
- - `path` penguins.json - - `schema` - - | name | type | +### schema + +| name | type | |:--------------------|:--------| | Species | string | | Island | string | @@ -704,12 +934,19 @@ | Flipper Length (mm) | integer | | Body Mass (g) | integer | | Sex | string | +### sources +| title | path | +|:------------------------------------|:-----------------------------------------| +| Palmer Station Antarctica LTER | https://pal.lternet.edu/ | +| Allison Horst's Penguins Repository | https://github.com/allisonhorst/penguins | ## `platformer-terrain.json` - - `description` Assets from the video game Celeste. - - `path` platformer-terrain.json - - `schema` - - | name | type | +### path +platformer-terrain.json +### description +Assets from the video game Celeste. +### schema + +| name | type | |:-----------|:--------| | x | integer | | y | integer | @@ -719,16 +956,24 @@ | id | string | | color | string | | key | string | +### sources +| title | path | +|:-------------|:----------------------------| +| Celeste Game | http://www.celestegame.com/ | ## `points.json` - - `path` points.json - - `schema` - - | name | type | +### path +points.json +### schema + +| name | type | |:-------|:-------| | x | number | | y | number | ## `political-contributions.json` - - `description` Summary financial information on contributions to candidates for U.S. +### path +political-contributions.json +### description +Summary financial information on contributions to candidates for U.S. elections. An updated version of this datset is available from the "all candidates" files (in pipe-delimited format) on the bulk data download page of the U.S. Federal Election Commission, or, alternatively, via OpenFEC. Information on each of the 25 columns is @@ -747,10 +992,9 @@ > A few restrictions limit the way you can use FEC data. For example, you can't use > contributor lists for commercial purposes or to solicit donations. Learn more on > [FEC.gov](https://www.fec.gov/). 
- - `path` political-contributions.json - - `schema` - - | name | type | +### schema + +| name | type | |:----------------------------------------------|:--------| | Candidate_Identification | string | | Candidate_Name | string | @@ -777,8 +1021,16 @@ | Coverage_End_Date | string | | Refunds_to_Individuals | integer | | Refunds_to_Committees | integer | +### sources +| title | path | +|:--------------------------------------|:----------------------------------------------------| +| Federal Election Commission Bulk Data | https://www.fec.gov/data/browse-data/?tab=bulk-data | +| OpenFEC API | https://api.open.fec.gov/developers/ | ## `population.json` - - `description` United States population statistics by sex and age group across decades between 1850 and 2000. +### path +population.json +### description +United States population statistics by sex and age group across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which "collects, preserves and harmonizes U.S. census microdata" from as early as 1790. @@ -791,51 +1043,70 @@ Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0 - - `path` population.json - - `schema` - - | name | type | description | +### schema + +| name | type | description | |:-------|:--------|:--------------------------------------------------------------------| | year | integer | Four-digit year of the survey | | age | integer | Age group in 5-year intervals (0=0-4, 5=5-9, 10=10-14, ..., 90=90+) | | sex | integer | Sex (1=men, 2=women) | | people | integer | Number of individuals (IPUMS PERWT) | +### sources +| title | path | +|:----------|:---------------------------| +| IPUMS USA | https://usa.ipums.org/usa/ | ## `population_engineers_hurricanes.csv` - - `description` Per-state data on population, number of engineers, and hurricanes. 
Used in Vega-Lite example, +### path +population_engineers_hurricanes.csv +### description +Per-state data on population, number of engineers, and hurricanes. Used in Vega-Lite example, [Three Choropleths Representing Disjoint DAta from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html) - - `path` population_engineers_hurricanes.csv - - `schema` - - | name | type | +### schema + +| name | type | |:-----------|:--------| | state | string | | id | integer | | population | integer | | engineers | number | | hurricanes | integer | +### sources +| title | path | +|:-----------------------------------|:------------------------------------------------------------------------------------------------------------------| +| Bureau of Labor Statistics | https://www.bls.gov/oes/tables.htm | +| American Community Survey | https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=ACS_07_3YR_S1901&prodType=table | +| NOAA National Climatic Data Center | https://www.ncdc.noaa.gov/cdo-web/datatools/records | ## `seattle-weather-hourly-normals.csv` - - `description` Hourly weather normals with metric units. The 1981-2010 Climate Normals are +### path +seattle-weather-hourly-normals.csv +### description +Hourly weather normals with metric units. The 1981-2010 Climate Normals are NCDC's three-decade averages of climatological variables, including temperature and precipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf). We only included temperature, wind, and pressure and updated the format to be easier to parse. 
- - `path` seattle-weather-hourly-normals.csv - - `schema` - - | name | type | +### schema + +| name | type | |:------------|:---------| | date | datetime | | pressure | number | | temperature | number | | wind | number | +### sources +| title | path | +|:------------------------------------------|:----------------------------------------------------| +| NOAA National Climatic Data Center (NCDC) | https://www.ncdc.noaa.gov/cdo-web/datatools/normals | ## `seattle-weather.csv` - - `description` Daily weather records with metric units. Transformed using `/scripts/weather.py`. +### path +seattle-weather.csv +### description +Daily weather records with metric units. Transformed using `/scripts/weather.py`. The categorical "weather" field is synthesized from multiple fields in the original dataset. This data is intended for instructional purposes. - - `path` seattle-weather.csv - - `schema` - - | name | type | +### schema + +| name | type | |:--------------|:-------| | date | date | | precipitation | number | @@ -843,12 +1114,18 @@ | temp_min | number | | wind | number | | weather | string | +### sources +| title | path | +|:-----------------------------------|:----------------------------------------------------| +| NOAA National Climatic Data Center | https://www.ncdc.noaa.gov/cdo-web/datatools/records | ## `sp500-2000.csv` - - `description` S&P 500 index values from 2000 to 2020. - - `path` sp500-2000.csv - - `schema` - - | name | type | +### path +sp500-2000.csv +### description +S&P 500 index values from 2000 to 2020. 
+### schema + +| name | type | |:---------|:--------| | date | date | | open | number | @@ -857,33 +1134,43 @@ | close | number | | adjclose | number | | volume | integer | +### sources +| title | path | +|:--------------|:------------------------------------------------| +| Yahoo Finance | https://finance.yahoo.com/quote/%5EDJI/history/ | ## `sp500.csv` - - `path` sp500.csv - - `schema` - - | name | type | +### path +sp500.csv +### schema + +| name | type | |:-------|:-------| | date | string | | price | number | ## `stocks.csv` - - `path` stocks.csv - - `schema` - - | name | type | +### path +stocks.csv +### schema + +| name | type | |:-------|:-------| | symbol | string | | date | string | | price | number | ## `udistrict.json` - - `path` udistrict.json - - `schema` - - | name | type | +### path +udistrict.json +### schema + +| name | type | |:-------|:-------| | key | string | | lat | number | ## `unemployment-across-industries.json` - - `description` Industry-level unemployment statistics from the Current Population Survey +### path +unemployment-across-industries.json +### description +Industry-level unemployment statistics from the Current Population Survey (CPS), published monthly by the U.S. Bureau of Labor Statistics. Includes unemployed persons and unemployment rate across 11 private industries, as well as agricultural, government, and self-employed workers. Covers January 2000 through February 2010. Industry classification @@ -900,10 +1187,9 @@ > that are not BLS-sponsored." See full BLS [terms of service](https://www.bls.gov/developers/termsOfService.htm). 
- - `path` unemployment-across-industries.json - - `schema` - - | name | type | description | +### schema + +| name | type | description | |:-------|:---------|:------------------------------------------------------------------| | series | string | Industry name | | year | integer | Year (2000-2010) | @@ -911,8 +1197,17 @@ | count | integer | Number of unemployed persons (in thousands) | | rate | number | Unemployment rate (percentage) | | date | datetime | ISO 8601-formatted date string (e.g., "2000-01-01T08:00:00.000Z") | +### sources +| title | path | +|:---------------------------------------------|:-------------------------------------------------| +| U.S. Census Bureau Current Population Survey | https://www.census.gov/programs-surveys/cps.html | +| BLS LAUS Data Tools | https://www.bls.gov/lau/data.htm | +| Bureau of Labor Statistics Table A-31 | https://www.bls.gov/web/empsit/cpseea31.htm | ## `unemployment.tsv` - - `description` This dataset contains county-level unemployment rates in the United States, with data generally +### path +unemployment.tsv +### description +This dataset contains county-level unemployment rates in the United States, with data generally consistent with levels reported in 2009. The dataset is structured as tab-separated values. The unemployment rate represents the number of unemployed persons as a percentage of the labor force. According to the Bureau of Labor Statistics (BLS) glossary: @@ -938,25 +1233,34 @@ - API documentation and examples are available on the BLS Developers page. When using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm). 
- - `path` unemployment.tsv - - `schema` - - | name | type | description | +### schema + +| name | type | description | |:-------|:--------|:----------------------------------------| | id | integer | The combined state and county FIPS code | | rate | number | The unemployment rate for the county | +### sources +| title | path | +|:------------------------|:------------------------------------------| +| BLS Developers API | https://www.bls.gov/developers/ | +| BLS Handbook of Methods | https://www.bls.gov/opub/hom/lau/home.htm | ## `uniform-2d.json` - - `path` uniform-2d.json - - `schema` - - | name | type | +### path +uniform-2d.json +### schema + +| name | type | |:-------|:-------| | u | number | | v | number | ## `us-10m.json` - - `path` us-10m.json +### path +us-10m.json ## `us-employment.csv` - - `description` In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job +### path +us-employment.csv +### description +In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job losses across the United States. The downturn in employment, and the slow recovery in hiring that followed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau of Labor Statistics. @@ -973,10 +1277,9 @@ supersector's employment. It is useful for illustrating how to make bar charts that report both negative and positive values. - - `path` us-employment.csv - - `schema` - - | name | type | +### schema + +| name | type | |:-----------------------------------|:--------| | month | date | | nonfarm | integer | @@ -1002,28 +1305,41 @@ | other_services | integer | | government | integer | | nonfarm_change | integer | +### sources +| title | path | +|:--------------------------------------------------------------|:-------------------------| +| U.S. 
Bureau of Labor Statistics Current Employment Statistics | https://www.bls.gov/ces/ | ## `us-state-capitals.json` - - `path` us-state-capitals.json - - `schema` - - | name | type | +### path +us-state-capitals.json +### schema + +| name | type | |:-------|:-------| | lon | number | | lat | number | | state | string | | city | string | ## `volcano.json` - - `description` Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. +### path +volcano.json +### description +Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. This data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a topographic map by Ross Ihaka, adapted from R datasets. These data should not be regarded as accurate. - - `path` volcano.json +### sources +| title | path | +|:-----------|:---------------------------------------------------------------------------| +| R Datasets | https://stat.ethz.ch/R-manual/R-patched/library/datasets/html/volcano.html | ## `weather.csv` - - `description` NOAA data transformed using `/scripts/weather.py`. Categorical "weather" field synthesized +### path +weather.csv +### description +NOAA data transformed using `/scripts/weather.py`. Categorical "weather" field synthesized from multiple fields in the original dataset. This data is intended for instructional purposes. - - `path` weather.csv - - `schema` - - | name | type | +### schema + +| name | type | |:--------------|:-------| | location | string | | date | date | @@ -1032,29 +1348,43 @@ | temp_min | number | | wind | number | | weather | string | +### sources +| title | path | +|:-------------------------|:-------------------------------------------------------| +| NOAA Climate Data Online | http://www.ncdc.noaa.gov/cdo-web/datatools/findstation | ## `weather.json` - - `description` Instructional dataset showing actual and predicted temperature data. 
- - `path` weather.json +### path +weather.json +### description +Instructional dataset showing actual and predicted temperature data. ## `wheat.json` - - `description` In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair), +### path +wheat.json +### description +In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair), a Scottish engineer who is often credited as the founder of statistical graphics, published an elegant chart on the price of wheat. It plots 250 years of prices alongside weekly wages and the reigning monarch. He intended to demonstrate that "never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time." - - `path` wheat.json - - `schema` - - | name | type | +### schema + +| name | type | |:-------|:--------| | year | integer | | wheat | number | | wages | number | +### sources +| title | path | +|:--------------------|:----------------------------------------------------------------------| +| 1822 Playfair Chart | http://dh101.humanities.ucla.edu/wp-content/uploads/2014/08/Vis_2.jpg | ## `windvectors.csv` - - `description` Simulated wind patterns over northwestern Europe. - - `path` windvectors.csv - - `schema` - - | name | type | +### path +windvectors.csv +### description +Simulated wind patterns over northwestern Europe. 
+### schema + +| name | type | |:----------|:--------| | longitude | number | | latitude | number | @@ -1062,17 +1392,24 @@ | dirCat | integer | | speed | number | ## `world-110m.json` - - `path` world-110m.json +### path +world-110m.json ## `zipcodes.csv` - - `description` GeoNames.org - - `path` zipcodes.csv - - `schema` - - | name | type | +### path +zipcodes.csv +### description +GeoNames.org +### schema + +| name | type | |:----------|:--------| | zip_code | integer | | latitude | number | | longitude | number | | city | string | | state | string | -| county | string | \ No newline at end of file +| county | string | +### sources +| title | path | +|:---------|:-------------------------| +| GeoNames | https://www.geonames.org | \ No newline at end of file diff --git a/datapackage.json b/datapackage.json index 7db7733b..d0110cea 100644 --- a/datapackage.json +++ b/datapackage.json @@ -16,7 +16,7 @@ } ], "version": "2.11.0", - "created": "2024-12-13T12:53:03.887410+00:00", + "created": "2024-12-13T18:29:45.637443+00:00", "resources": [ { "name": "7zip.png", From f4c84f7c558d127ccdc5eed40700c1780653a33c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 13 Dec 2024 18:39:01 +0000 Subject: [PATCH 28/40] fix: unpack both sequences during merge --- scripts/build_datapackage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/build_datapackage.py b/scripts/build_datapackage.py index c223da07..445ca8b3 100755 --- a/scripts/build_datapackage.py +++ b/scripts/build_datapackage.py @@ -423,7 +423,7 @@ def _merge_package_metadata( logger.warning(msg, stacklevel=2) changes[k] = extra elif isinstance(item, Sequence): - changes[k] = [*item, extra] + changes[k] = [*item, *extra] else: msg = ( f"Expected only lists of mappings or single values, " From 6c6e575a937fefded87b572f4ed171fd396ee826 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 13 
Dec 2024 18:40:22 +0000 Subject: [PATCH 29/40] feat: add link to repo contributors https://github.com/vega/vega-datasets/issues/634#issuecomment-2510516268 --- _data/datapackage_additions.toml | 4 ++++ datapackage-tabular.md | 9 +++++---- datapackage.json | 6 +++++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/_data/datapackage_additions.toml b/_data/datapackage_additions.toml index 55269912..3d82bc28 100644 --- a/_data/datapackage_additions.toml +++ b/_data/datapackage_additions.toml @@ -3,6 +3,10 @@ name = "BSD-3-Clause" path = "https://opensource.org/license/bsd-3-clause" title = "The 3-Clause BSD License" +[[contributors]] +title = "vega-datasets contributors" +path = "https://github.com/vega/vega-datasets/graphs/contributors" + [[resources]] # Path: 7zip.png path = "7zip.png" description = """Application icons from open-source software projects.""" diff --git a/datapackage-tabular.md b/datapackage-tabular.md index f6787e06..85a545f2 100644 --- a/datapackage-tabular.md +++ b/datapackage-tabular.md @@ -1,5 +1,5 @@ # vega-datasets -`2.11.0` | ([GitHub](http://github.com/vega/vega-datasets.git)) | (2024-12-13T18:29:45.637443+00:00) +`2.11.0` | ([GitHub](http://github.com/vega/vega-datasets.git)) | (2024-12-13T18:37:59.958722+00:00) Common repository for example datasets used by Vega related projects. @@ -9,9 +9,10 @@ Common repository for example datasets used by Vega related projects. 
| BSD-3-Clause | https://opensource.org/license/bsd-3-clause | The 3-Clause BSD License | ## contributors -| title | path | -|:------------------------|:-----------------------------| -| UW Interactive Data Lab | http://idl.cs.washington.edu | +| title | path | +|:---------------------------|:----------------------------------------------------------| +| UW Interactive Data Lab | http://idl.cs.washington.edu | +| vega-datasets contributors | https://github.com/vega/vega-datasets/graphs/contributors | # resources ## `7zip.png` diff --git a/datapackage.json b/datapackage.json index d0110cea..f94b6ba0 100644 --- a/datapackage.json +++ b/datapackage.json @@ -13,10 +13,14 @@ { "title": "UW Interactive Data Lab", "path": "http://idl.cs.washington.edu" + }, + { + "title": "vega-datasets contributors", + "path": "https://github.com/vega/vega-datasets/graphs/contributors" } ], "version": "2.11.0", - "created": "2024-12-13T18:29:45.637443+00:00", + "created": "2024-12-13T18:37:59.958722+00:00", "resources": [ { "name": "7zip.png", From 6148b8b489640bb81d63ce2550d85e25f3f511f0 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 13 Dec 2024 18:46:56 +0000 Subject: [PATCH 30/40] fix: correct typo, add link Resolves (https://github.com/vega/vega-datasets/pull/643#discussion_r1884029008) --- _data/datapackage_additions.toml | 2 +- datapackage-tabular.md | 4 ++-- datapackage.json | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/_data/datapackage_additions.toml b/_data/datapackage_additions.toml index 3d82bc28..56665c7a 100644 --- a/_data/datapackage_additions.toml +++ b/_data/datapackage_additions.toml @@ -534,7 +534,7 @@ path = "https://data.london.gov.uk/dataset/statistical-gis-boundary-files-londo [[resources]] # Path: londonCentroids.json path = "londonCentroids.json" -description = """Calculated from `londongBoroughs.json` using `d3.geoCentroid`.""" +description = """Calculated from 
`londonBoroughs.json` using [`d3.geoCentroid`](https://d3js.org/d3-geo/math#geoCentroid).""" [[resources]] # Path: londonTubeLines.json path = "londonTubeLines.json" diff --git a/datapackage-tabular.md b/datapackage-tabular.md index 85a545f2..5cc44972 100644 --- a/datapackage-tabular.md +++ b/datapackage-tabular.md @@ -1,5 +1,5 @@ # vega-datasets -`2.11.0` | ([GitHub](http://github.com/vega/vega-datasets.git)) | (2024-12-13T18:37:59.958722+00:00) +`2.11.0` | ([GitHub](http://github.com/vega/vega-datasets.git)) | (2024-12-13T18:45:28.671592+00:00) Common repository for example datasets used by Vega related projects. @@ -775,7 +775,7 @@ Boundaries of London boroughs reprojected and simplified from `London_Borough_Ex ### path londonCentroids.json ### description -Calculated from `londongBoroughs.json` using `d3.geoCentroid`. +Calculated from `londonBoroughs.json` using [`d3.geoCentroid`](https://d3js.org/d3-geo/math#geoCentroid). ### schema | name | type | diff --git a/datapackage.json b/datapackage.json index f94b6ba0..248ae109 100644 --- a/datapackage.json +++ b/datapackage.json @@ -20,7 +20,7 @@ } ], "version": "2.11.0", - "created": "2024-12-13T18:37:59.958722+00:00", + "created": "2024-12-13T18:45:28.671592+00:00", "resources": [ { "name": "7zip.png", @@ -1762,7 +1762,7 @@ { "name": "londoncentroids.json", "type": "table", - "description": "Calculated from `londongBoroughs.json` using `d3.geoCentroid`.", + "description": "Calculated from `londonBoroughs.json` using [`d3.geoCentroid`](https://d3js.org/d3-geo/math#geoCentroid).", "path": "londonCentroids.json", "scheme": "file", "format": "json", From 6cbe84fa119e2aa69e50a1a4fa7a052d9b373488 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 13 Dec 2024 21:08:23 +0000 Subject: [PATCH 31/40] style: Adjust link, date format for markdown --- _data/templates/package-table.md | 2 +- datapackage-tabular.md | 2 +- datapackage.json | 2 +- scripts/build_datapackage.py | 
14 ++++++++++++++ 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/_data/templates/package-table.md b/_data/templates/package-table.md index 0daee636..68fb8918 100644 --- a/_data/templates/package-table.md +++ b/_data/templates/package-table.md @@ -1,5 +1,5 @@ # {{ package.name }} -`{{ package.version }}` | ([GitHub]({{ package.homepage }})) | ({{ package.created }}) +`{{ package.version }}` | [GitHub]({{ package.homepage }}) | {{ package.created | fmt_date()}} {{ package.description }} diff --git a/datapackage-tabular.md b/datapackage-tabular.md index 5cc44972..b85923a0 100644 --- a/datapackage-tabular.md +++ b/datapackage-tabular.md @@ -1,5 +1,5 @@ # vega-datasets -`2.11.0` | ([GitHub](http://github.com/vega/vega-datasets.git)) | (2024-12-13T18:45:28.671592+00:00) +`2.11.0` | [GitHub](http://github.com/vega/vega-datasets.git) | 2024-12-13 21:05:54 [UTC] Common repository for example datasets used by Vega related projects. diff --git a/datapackage.json b/datapackage.json index 248ae109..8de71218 100644 --- a/datapackage.json +++ b/datapackage.json @@ -20,7 +20,7 @@ } ], "version": "2.11.0", - "created": "2024-12-13T18:45:28.671592+00:00", + "created": "2024-12-13T21:05:54.344553+00:00", "resources": [ { "name": "7zip.png", diff --git a/scripts/build_datapackage.py b/scripts/build_datapackage.py index 445ca8b3..60661f62 100755 --- a/scripts/build_datapackage.py +++ b/scripts/build_datapackage.py @@ -150,6 +150,19 @@ ) +def fmt_date(s: str, /) -> str: + """ + Reformat `package.created` at a lower resolution. + + Use a friendlier date format for markdown. + """ + datetime = dt.datetime.fromisoformat(s) + dt_fmt = datetime.replace(tzinfo=None).isoformat(sep=" ", timespec="seconds") + if tzname := datetime.tzname(): + return f"{dt_fmt} [{tzname}]" + return dt_fmt + + def render_markdown_patch(path: str, data: dict[str, Any]) -> str: """ Patch to `frictionless.formats.markdown.mapper.render_markdown`_ to support template overrides. 
@@ -182,6 +195,7 @@ def render_markdown_patch(path: str, data: dict[str, Any]) -> str: environ.filters["filter_dict"] = fl_markdown.filter_dict environ.filters["dict_to_markdown"] = fl_markdown.dict_to_markdown environ.filters["tabulate"] = fl_markdown.dicts_to_markdown_table + environ.filters["fmt_date"] = fmt_date template = environ.get_template(path) return template.render(**data) From ed39948fdfa3688bf317acbe331fabbda69a9a5b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 13 Dec 2024 21:18:44 +0000 Subject: [PATCH 32/40] fix: remove unneeded indent I think this was in the original due to everything being in a bullet list. No longer relevant now --- _data/templates/resource-table.md | 2 +- datapackage-tabular.md | 368 +++++++++++++++--------------- datapackage.json | 2 +- 3 files changed, 186 insertions(+), 186 deletions(-) diff --git a/_data/templates/resource-table.md b/_data/templates/resource-table.md index 9290060c..78931bd6 100644 --- a/_data/templates/resource-table.md +++ b/_data/templates/resource-table.md @@ -6,7 +6,7 @@ {% endif %} {% if resource.description %} ### description -{{ resource.description | indent(4, False) }} +{{ resource.description }} {% endif %} {% if resource.schema %} ### schema diff --git a/datapackage-tabular.md b/datapackage-tabular.md index b85923a0..bf9bb72a 100644 --- a/datapackage-tabular.md +++ b/datapackage-tabular.md @@ -1,5 +1,5 @@ # vega-datasets -`2.11.0` | [GitHub](http://github.com/vega/vega-datasets.git) | 2024-12-13 21:05:54 [UTC] +`2.11.0` | [GitHub](http://github.com/vega/vega-datasets.git) | 2024-12-13 21:17:21 [UTC] Common repository for example datasets used by Vega related projects. @@ -61,11 +61,11 @@ barley.json ### description The result of a 1930s agricultural experiment in Minnesota, this dataset contains yields for 10 different varieties of barley at six different sites. - It was first published by agronomists F.R. Immer, H.K. Hayes, and L. 
Powers in the 1934 paper "Statistical Determination of Barley Varietal Adaption". +It was first published by agronomists F.R. Immer, H.K. Hayes, and L. Powers in the 1934 paper "Statistical Determination of Barley Varietal Adaption". - R.A. Fisher's popularized its use in the field of statistics when he included it in his book "The Design of Experiments". +R.A. Fisher's popularized its use in the field of statistics when he included it in his book "The Design of Experiments". - Since then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s. +Since then it has been used to demonstrate new statistical techniques, including the trellis charts developed by Richard Becker, William Cleveland and others in the 1990s. ### schema @@ -208,33 +208,33 @@ burtin.json ### description The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine. - The dataset compares the performance of three antibiotics against 16 different bacteria. +The dataset compares the performance of three antibiotics against 16 different bacteria. - Numerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness. +Numerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness. - The dataset was featured as an example in the Protovis project, a precursor to D3.js. +The dataset was featured as an example in the Protovis project, a precursor to D3.js. - As noted in the Protovis example, "Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 μg/ml, and an exaggeration of some values for penicillin". 
+As noted in the Protovis example, "Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 μg/ml, and an exaggeration of some values for penicillin". - The vega-datsets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together. +The vega-datsets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together. - The caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) - reads as follows: +The caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) +reads as follows: - > ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin - > - > - > The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in > red and gram- in blue) with their sensitivities to penicillin, and streptomycin. - > - > The effectiveness of the antibiotics is expressed as the highest dilution in μ/ml. which inhibits > the test organism. - > - > High dilutions are toward the periphery; consequently the length of the colored bar is proportional > to the effectiveness. - > - > It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. > fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. > vulgaris, S. schottmuelleri and M. tuberculosis. - > - > Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to > neomycin, although the majority of these are sensitive to neomycin. 
- > - > It also inhibits actinomycetes, but is inactive against viruses and fungi. Its mode of action is > not understood. +> ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin +> +> +> The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in > red and gram- in blue) with their sensitivities to penicillin, and streptomycin. +> +> The effectiveness of the antibiotics is expressed as the highest dilution in μ/ml. which inhibits > the test organism. +> +> High dilutions are toward the periphery; consequently the length of the colored bar is proportional > to the effectiveness. +> +> It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. > fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. > vulgaris, S. schottmuelleri and M. tuberculosis. +> +> Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to > neomycin, although the majority of these are sensitive to neomycin. +> +> It also inhibits actinomycetes, but is inactive against viruses and fungi. Its mode of action is > not understood. ### schema @@ -278,7 +278,7 @@ Collection of car specifications and performance metrics from various automobile co2-concentration.csv ### description Scripps CO2 program data ut modified to only include date, CO2, seasonally adjusted CO2. - Only includes rows with valid data. +Only includes rows with valid data. ### schema | name | type | @@ -295,12 +295,12 @@ Scripps CO2 program data ut modified to only include date, CO2, seasonally adjus countries.json ### description This dataset combines key demographic indicators (life expectancy at birth and - fertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year - intervals. It includes both current values and adjacent time period values (previous and next) - for each indicator. 
Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) - notes that its philosophy is to fill data gaps with estimates and use current - geographic boundaries for historical data. Gapminder states that it aims to "show people the - big picture" rather than support detailed numeric analysis. +fertility rate measured as babies per woman) for various countries from 1955 to 2000 at 5-year +intervals. It includes both current values and adjacent time period values (previous and next) +for each indicator. Gapminder's [data documentation](https://www.gapminder.org/data/documentation/) +notes that its philosophy is to fill data gaps with estimates and use current +geographic boundaries for historical data. Gapminder states that it aims to "show people the +big picture" rather than support detailed numeric analysis. ### schema | name | type | description | @@ -538,8 +538,8 @@ Flight delay statistics from U.S. Bureau of Transportation Statistics. Transform football.json ### description Football match outcomes across multiple divisions from 2013 to 2017, part of a - larger dataset from OpenFootball. The subset was made such that there are records for all five - chosen divisions over the time period. +larger dataset from OpenFootball. The subset was made such that there are records for all five +chosen divisions over the time period. ### schema | name | type | @@ -559,12 +559,12 @@ Football match outcomes across multiple divisions from 2013 to 2017, part of a gapminder-health-income.csv ### description Per-capita income, life expectancy, population and regional grouping. Dataset does not specify - the reference year for the data. Gapminder historical data is subject to revisions. +the reference year for the data. Gapminder historical data is subject to revisions. 
- Gapminder (v30, 2023) defines per-capita income as follows: - >"This is real GDP per capita (gross domestic product per person adjusted for inflation) - >converted to international dollars using purchasing power parity rates. An international dollar - >has the same purchasing power over GDP as the U.S. dollar has in the United States." +Gapminder (v30, 2023) defines per-capita income as follows: +>"This is real GDP per capita (gross domestic product per person adjusted for inflation) +>converted to international dollars using purchasing power parity rates. An international dollar +>has the same purchasing power over GDP as the U.S. dollar has in the United States." ### schema @@ -589,27 +589,27 @@ Per-capita income, life expectancy, population and regional grouping. Dataset do gapminder.json ### description This dataset combines key demographic indicators (life expectancy at birth, - population, and fertility rate measured as babies per woman) for various countries from 1955 - to 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable - grouping countries. Gapminder's data documentation notes that its philosophy is to fill data - gaps with estimates and use current geographic boundaries for historical data. Gapminder - states that it aims to "show people the big picture" rather than support detailed numeric - analysis. +population, and fertility rate measured as babies per woman) for various countries from 1955 +to 2005 at 5-year intervals. It also includes a 'cluster' column, a categorical variable +grouping countries. Gapminder's data documentation notes that its philosophy is to fill data +gaps with estimates and use current geographic boundaries for historical data. Gapminder +states that it aims to "show people the big picture" rather than support detailed numeric +analysis. - Notes: - 1. Country Selection: The set of countries in this file matches the version of this dataset - originally added to this collection in 2015. 
The specific criteria for country selection - in that version are not known. Data for Aruba are no longer available in the new version. - Hong Kong has been revised to Hong Kong, China in the new version. +Notes: +1. Country Selection: The set of countries in this file matches the version of this dataset + originally added to this collection in 2015. The specific criteria for country selection + in that version are not known. Data for Aruba are no longer available in the new version. + Hong Kong has been revised to Hong Kong, China in the new version. - 2. Data Precision: The precision of float values may have changed from the original version. - These changes reflect the most recent source data used for each indicator. +2. Data Precision: The precision of float values may have changed from the original version. + These changes reflect the most recent source data used for each indicator. - 3. Regional Groupings: The 'cluster' column represents a regional mapping of countries - corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. To - preserve continuity with previous versions of this dataset, we have retained the column - name 'cluster' instead of renaming it to 'six_regions'. The six regions represented are: - `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`. +3. Regional Groupings: The 'cluster' column represents a regional mapping of countries + corresponding to the 'six_regions' schema in Gapminder's Data Geographies dataset. To + preserve continuity with previous versions of this dataset, we have retained the column + name 'cluster' instead of renaming it to 'six_regions'. The six regions represented are: + `0: south_asia, 1: europe_central_asia, 2: sub_saharan_africa, 3: america, 4: east_asia_pacific, 5: middle_east_north_africa`. 
### schema | name | type | description | @@ -681,9 +681,9 @@ income.json iowa-electricity.csv ### description The state of Iowa has dramatically increased its production of renewable - wind power in recent years. This file contains the annual net generation of electricity in - the state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. - It is useful for illustrating stacked area charts. +wind power in recent years. This file contains the annual net generation of electricity in +the state by source in thousand megawatthours. U.S. EIA data downloaded on May 6, 2018. +It is useful for illustrating stacked area charts. ### schema | name | type | @@ -701,25 +701,25 @@ jobs.json ### description U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC1950#codes_section) by sex and year across decades between 1850 and 2000. The dataset was obtained from IPUMS USA, which "collects, preserves and harmonizes U.S. census microdata" from as early as 1790. - Originally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Viégas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). - The dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/). +Originally created for a 2006 data visualization project called *sense.us* by IBM Research (Jeff Heer, Martin Wattenberg and Fernanda Viégas), described [here](https://homes.cs.washington.edu/~jheer/files/bdata_ch12.pdf). +The dataset is also referenced in this vega [example](https://vega.github.io/vega/examples/job-voyager/). - Data is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of IPUMS USA, according to 2024 correspondence with the IPUMS Project. 
IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions). +Data is based on a tabulation of the [OCC1950](https://usa.ipums.org/usa-action/variables/OCC1950) variable by sex across IPUMS USA samples. The dataset appears to be derived from Version 6.0 (2015) of IPUMS USA, according to 2024 correspondence with the IPUMS Project. IPUMS has made improvements to occupation coding since version 6, particularly for 19th-century samples, which may result in discrepancies between this dataset and current IPUMS data. Details on data revisions are available [here](https://usa.ipums.org/usa-action/revisions). - IPUMS USA confirmed in 2024 correspondence that hosting this dataset on vega-datasets is permissible, stating: - >We're excited to hear that this dataset made its way to this repository and is being used by students for data visualization. We allow for these types of redistributions of summary data so long as the underlying microdata records are not shared. +IPUMS USA confirmed in 2024 correspondence that hosting this dataset on vega-datasets is permissible, stating: +>We're excited to hear that this dataset made its way to this repository and is being used by students for data visualization. We allow for these types of redistributions of summary data so long as the underlying microdata records are not shared. - This dataset contains only summary statistics and does not include any underlying microdata records. +This dataset contains only summary statistics and does not include any underlying microdata records. - 1. This dataset represents summary data. The underlying microdata records are not included. - 2. 
Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section) - (person weight) variable as an expansion factor when working with IPUMS USA extracts. - 3. Due to coding revisions, figures for earlier years (particularly 19th century) may not match current IPUMS USA data exactly. +1. This dataset represents summary data. The underlying microdata records are not included. +2. Users attempting to replicate or extend this data should use the [PERWT](https://usa.ipums.org/usa-action/variables/PERWT#description_section) +(person weight) variable as an expansion factor when working with IPUMS USA extracts. +3. Due to coding revisions, figures for earlier years (particularly 19th century) may not match current IPUMS USA data exactly. - When using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml). - The organization requests use of the following citation for this json file: +When using this dataset, please refer to IPUMS USA [terms of use](https://usa.ipums.org/usa/terms.shtml). +The organization requests use of the following citation for this json file: - Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0 +Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. http://doi.org/10.18128/D010.V6.0 ### schema @@ -739,8 +739,8 @@ U.S. census data on [occupations](https://usa.ipums.org/usa-action/variables/OCC la-riots.csv ### description More than 60 people lost their lives amid the looting and fires that ravaged Los Angeles - for five days starting on April 29, 1992. This file contains metadata about each person, including the geographic - coordinates of their death. 
Compiled and published by the Los Angeles Times Data Desk. +for five days starting on April 29, 1992. This file contains metadata about each person, including the geographic +coordinates of their death. Compiled and published by the Los Angeles Times Data Desk. ### schema | name | type | @@ -765,8 +765,8 @@ More than 60 people lost their lives amid the looting and fires that ravaged Los londonBoroughs.json ### description Boundaries of London boroughs reprojected and simplified from `London_Borough_Excluding_MHW` shapefile. - Original data "contains National Statistics data © Crown copyright and database right (2015)" - and "Contains Ordnance Survey data © Crown copyright and database right [2015]. +Original data "contains National Statistics data © Crown copyright and database right (2015)" +and "Contains Ordnance Survey data © Crown copyright and database right [2015]. ### sources | title | path | |:-------------------------------------------------|:-------------------------------------------------------------------------| @@ -819,23 +819,23 @@ miserables.json monarchs.json ### description A chronological list of English and British monarchs from Elizabeth I through George IV. - Each entry includes: +Each entry includes: - The dataset contains two intentional inaccuracies to maintain compatibility with - the [Wheat and Wages](https://vega.github.io/vega/examples/wheat-and-wages/) example visualization: - 1. the start date for the reign of Elizabeth I is shown as 1565, instead of 1558; - 2. the end date for the reign of George IV is shown as 1820, instead of 1830. - These discrepancies align the `monarchs.json` dataset with the start and end dates of the `wheat.json` dataset used i the visualization. - The entry "W&M" represents the joint reign of William III and Mary II. 
While the dataset shows their reign as 1689-1702, - the official Web site of the British royal family indicates that Mary II's reign ended in 1694, though William III continued to rule until 1702. - The `commonwealth` field is used to flag the period from 1649 to 1660, which includes the Commonwealth of England, the Protectorate, - and the period leading to the Restoration. While historically more accurate to call this the "interregnum," the field name of `commonwealth` - from the original dataset is retained for backwards compatibility. - The dataset was revised in Aug. 2024. James II's reign now ends in 1688 (previously 1689). - Source data has been verified against the kings & queens and interregnum pages of the official website of the British royal family (retrieved in Aug. 2024). - Content on the site is protected by Crown Copyright. - Under the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most - Crown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/). +The dataset contains two intentional inaccuracies to maintain compatibility with +the [Wheat and Wages](https://vega.github.io/vega/examples/wheat-and-wages/) example visualization: +1. the start date for the reign of Elizabeth I is shown as 1565, instead of 1558; +2. the end date for the reign of George IV is shown as 1820, instead of 1830. +These discrepancies align the `monarchs.json` dataset with the start and end dates of the `wheat.json` dataset used i the visualization. +The entry "W&M" represents the joint reign of William III and Mary II. While the dataset shows their reign as 1689-1702, +the official Web site of the British royal family indicates that Mary II's reign ended in 1694, though William III continued to rule until 1702. 
+The `commonwealth` field is used to flag the period from 1649 to 1660, which includes the Commonwealth of England, the Protectorate, +and the period leading to the Restoration. While historically more accurate to call this the "interregnum," the field name of `commonwealth` +from the original dataset is retained for backwards compatibility. +The dataset was revised in Aug. 2024. James II's reign now ends in 1688 (previously 1689). +Source data has been verified against the kings & queens and interregnum pages of the official website of the British royal family (retrieved in Aug. 2024). +Content on the site is protected by Crown Copyright. +Under the [UK Government Licensing Framework](https://www.nationalarchives.gov.uk/information-management/re-using-public-sector-information/uk-government-licensing-framework/crown-copyright/), most +Crown copyright information is available under the [Open Government Licence v3.0](https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/). ### schema | name | type | description | @@ -854,7 +854,7 @@ A chronological list of English and British monarchs from Elizabeth I through Ge movies.json ### description The dataset has well known and intentionally included errors. - This dataset is provided for instructional purposes, including the need to reckon with dirty data. +This dataset is provided for instructional purposes, including the need to reckon with dirty data. ### schema | name | type | @@ -899,8 +899,8 @@ obesity.json ohlc.json ### description This dataset contains the performance of the Chicago Board Options Exchange - [Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/ - %5EVIX#overview)) in the summer of 2009. +[Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/ +%5EVIX#overview)) in the summer of 2009. 
### schema | name | type | @@ -921,9 +921,9 @@ This dataset contains the performance of the Chicago Board Options Exchange penguins.json ### description Palmer Archipelago (Antarctica) penguin data collected and made available by - [Dr. Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) - and the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research - Network](https://lternet.edu/). +[Dr. Kristen Gorman](https://www.uaf.edu/cfos/people/faculty/detail/kristen-gorman.php) +and the Palmer Station, Antarctica LTER, a member of the [Long Term Ecological Research +Network](https://lternet.edu/). ### schema | name | type | @@ -975,24 +975,24 @@ points.json political-contributions.json ### description Summary financial information on contributions to candidates for U.S. - elections. An updated version of this datset is available from the "all candidates" files - (in pipe-delimited format) on the bulk data download page of the U.S. Federal Election - Commission, or, alternatively, via OpenFEC. Information on each of the 25 columns is - available from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/). - The sample dataset in `political-contributions.json` contains 58 records with dates from 2015. +elections. An updated version of this datset is available from the "all candidates" files +(in pipe-delimited format) on the bulk data download page of the U.S. Federal Election +Commission, or, alternatively, via OpenFEC. Information on each of the 25 columns is +available from the [FEC All Candidates File Description](https://www.fec.gov/campaign-finance-data/all-candidates-file-description/). +The sample dataset in `political-contributions.json` contains 58 records with dates from 2015. 
- FEC data is subject to the commission's: - - [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/) - - [Privacy and Security Policy](https://www.fec.gov/about/privacy-and-security-policy/) - - [Acceptable Use Policy](https://github.com/fecgov/FEC/blob/master/ACCEPTABLE-USE-POLICY.md) +FEC data is subject to the commission's: +- [Sale or Use Policy](https://www.fec.gov/updates/sale-or-use-contributor-information/) +- [Privacy and Security Policy](https://www.fec.gov/about/privacy-and-security-policy/) +- [Acceptable Use Policy](https://github.com/fecgov/FEC/blob/master/ACCEPTABLE-USE-POLICY.md) - Additionally, the FEC's Github [repository](https://github.com/fecgov/FEC) states: - > This project is in the public domain within the United States, and we waive worldwide - > copyright and related rights through [CC0 universal public domain](https://creativecommons.org/publicdomain/zero/1.0/) - > dedication. Read more on our license page. - > A few restrictions limit the way you can use FEC data. For example, you can't use - > contributor lists for commercial purposes or to solicit donations. Learn more on - > [FEC.gov](https://www.fec.gov/). +Additionally, the FEC's Github [repository](https://github.com/fecgov/FEC) states: +> This project is in the public domain within the United States, and we waive worldwide +> copyright and related rights through [CC0 universal public domain](https://creativecommons.org/publicdomain/zero/1.0/) +> dedication. Read more on our license page. +> A few restrictions limit the way you can use FEC data. For example, you can't use +> contributor lists for commercial purposes or to solicit donations. Learn more on +> [FEC.gov](https://www.fec.gov/). ### schema | name | type | @@ -1032,17 +1032,17 @@ Summary financial information on contributions to candidates for U.S. population.json ### description United States population statistics by sex and age group across decades between 1850 and 2000. 
- The dataset was obtained from IPUMS USA, which "collects, preserves and harmonizes U.S. census - microdata" from as early as 1790. +The dataset was obtained from IPUMS USA, which "collects, preserves and harmonizes U.S. census +microdata" from as early as 1790. - IPUMS updates and revises datasets over time, which may result in discrepancies between this - dataset and current IPUMS data. Details on data revisions are available here. +IPUMS updates and revises datasets over time, which may result in discrepancies between this +dataset and current IPUMS data. Details on data revisions are available here. - When using this dataset, please refer to IPUMS USA terms of use. The organization requests the - use of the following citation for this json file: - Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated - Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. - http://doi.org/10.18128/D010.V6.0 +When using this dataset, please refer to IPUMS USA terms of use. The organization requests the +use of the following citation for this json file: +Steven Ruggles, Katie Genadek, Ronald Goeken, Josiah Grover, and Matthew Sobek. Integrated +Public Use Microdata Series: Version 6.0. Minneapolis: University of Minnesota, 2015. +http://doi.org/10.18128/D010.V6.0 ### schema @@ -1061,7 +1061,7 @@ United States population statistics by sex and age group across decades between population_engineers_hurricanes.csv ### description Per-state data on population, number of engineers, and hurricanes. Used in Vega-Lite example, - [Three Choropleths Representing Disjoint DAta from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html) +[Three Choropleths Representing Disjoint DAta from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html) ### schema | name | type | @@ -1082,10 +1082,10 @@ Per-state data on population, number of engineers, and hurricanes. 
Used in Vega- seattle-weather-hourly-normals.csv ### description Hourly weather normals with metric units. The 1981-2010 Climate Normals are - NCDC's three-decade averages of climatological variables, including temperature and - precipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf). - We only included temperature, wind, and pressure - and updated the format to be easier to parse. +NCDC's three-decade averages of climatological variables, including temperature and +precipitation. Learn more in the [documentation](https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/NORMAL_HLY_documentation.pdf). +We only included temperature, wind, and pressure +and updated the format to be easier to parse. ### schema | name | type | @@ -1103,8 +1103,8 @@ Hourly weather normals with metric units. The 1981-2010 Climate Normals are seattle-weather.csv ### description Daily weather records with metric units. Transformed using `/scripts/weather.py`. - The categorical "weather" field is synthesized from multiple fields in the original dataset. - This data is intended for instructional purposes. +The categorical "weather" field is synthesized from multiple fields in the original dataset. +This data is intended for instructional purposes. ### schema | name | type | @@ -1172,22 +1172,22 @@ udistrict.json unemployment-across-industries.json ### description Industry-level unemployment statistics from the Current Population Survey - (CPS), published monthly by the U.S. Bureau of Labor Statistics. Includes unemployed persons - and unemployment rate across 11 private industries, as well as agricultural, government, and - self-employed workers. Covers January 2000 through February 2010. Industry classification - follows format of CPS Table A-31. +(CPS), published monthly by the U.S. Bureau of Labor Statistics. 
Includes unemployed persons +and unemployment rate across 11 private industries, as well as agricultural, government, and +self-employed workers. Covers January 2000 through February 2010. Industry classification +follows format of CPS Table A-31. - The dataset can be replicated using the BLS API. For more, see the `scripts` folder of this - repository. +The dataset can be replicated using the BLS API. For more, see the `scripts` folder of this +repository. - The BLS Web site states: - > "Users of the public API should cite the date that data were accessed or retrieved using - > the API. Users must clearly state that "BLS.gov cannot vouch for the data or analyses - > derived from these data after the data have been retrieved from BLS.gov." The BLS.gov logo - > may not be used by persons who are not BLS employees or on products (including web pages) - > that are not BLS-sponsored." +The BLS Web site states: +> "Users of the public API should cite the date that data were accessed or retrieved using +> the API. Users must clearly state that "BLS.gov cannot vouch for the data or analyses +> derived from these data after the data have been retrieved from BLS.gov." The BLS.gov logo +> may not be used by persons who are not BLS employees or on products (including web pages) +> that are not BLS-sponsored." - See full BLS [terms of service](https://www.bls.gov/developers/termsOfService.htm). +See full BLS [terms of service](https://www.bls.gov/developers/termsOfService.htm). ### schema | name | type | description | @@ -1209,31 +1209,31 @@ Industry-level unemployment statistics from the Current Population Survey unemployment.tsv ### description This dataset contains county-level unemployment rates in the United States, with data generally - consistent with levels reported in 2009. The dataset is structured as tab-separated values. - The unemployment rate represents the number of unemployed persons as a percentage of the labor - force. 
According to the Bureau of Labor Statistics (BLS) glossary: +consistent with levels reported in 2009. The dataset is structured as tab-separated values. +The unemployment rate represents the number of unemployed persons as a percentage of the labor +force. According to the Bureau of Labor Statistics (BLS) glossary: - Unemployed persons (Current Population Survey) [are] persons aged 16 years and older who had - no employment during the reference week, were available for work, except for temporary - illness, and had made specific efforts to find employment sometime during the 4-week period - ending with the reference week. Persons who were waiting to be recalled to a job from which - they had been laid off need not have been looking for work to be classified as unemployed. +Unemployed persons (Current Population Survey) [are] persons aged 16 years and older who had +no employment during the reference week, were available for work, except for temporary +illness, and had made specific efforts to find employment sometime during the 4-week period +ending with the reference week. Persons who were waiting to be recalled to a job from which +they had been laid off need not have been looking for work to be classified as unemployed. - This dataset is derived from the [Local Area Unemployment Statistics (LAUS)](https://www.bls.gov/lau/) program, - a federal-state cooperative effort overseen by the Bureau of Labor Statistics (BLS). - The LAUS program produces monthly and annual employment, unemployment, and labor force data for census regions and divisions, - states, counties, metropolitan areas, and many cities and towns. +This dataset is derived from the [Local Area Unemployment Statistics (LAUS)](https://www.bls.gov/lau/) program, +a federal-state cooperative effort overseen by the Bureau of Labor Statistics (BLS). 
+The LAUS program produces monthly and annual employment, unemployment, and labor force data for census regions and divisions, +states, counties, metropolitan areas, and many cities and towns. - For the most up-to-date LAUS data: - 1. **Monthly and Annual Data Downloads**: - - Visit the [LAUS Data Tools](https://www.bls.gov/lau/data.htm) page for [monthly](https://www.bls.gov/lau/tables.htm#mcounty) - and [annual](https://www.bls.gov/lau/tables.htm#cntyaa) county data. - 2. **BLS Public Data API**: - - The BLS provides an API for developers to access various datasets, including LAUS data. - - To use the API for LAUS data, refer to the [LAUS Series ID Formats](https://www.bls.gov/help/hlpforma.htm#LA) to construct your query. - - API documentation and examples are available on the BLS Developers page. +For the most up-to-date LAUS data: +1. **Monthly and Annual Data Downloads**: +- Visit the [LAUS Data Tools](https://www.bls.gov/lau/data.htm) page for [monthly](https://www.bls.gov/lau/tables.htm#mcounty) +and [annual](https://www.bls.gov/lau/tables.htm#cntyaa) county data. +2. **BLS Public Data API**: +- The BLS provides an API for developers to access various datasets, including LAUS data. +- To use the API for LAUS data, refer to the [LAUS Series ID Formats](https://www.bls.gov/help/hlpforma.htm#LA) to construct your query. +- API documentation and examples are available on the BLS Developers page. - When using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm). +When using BLS public data API and datasets, users should adhere to the [BLS Terms of Service](https://www.bls.gov/developers/termsOfService.htm). ### schema | name | type | description | @@ -1262,21 +1262,21 @@ us-10m.json us-employment.csv ### description In the mid 2000s the global economy was hit by a crippling recession. One result: Massive job - losses across the United States. 
The downturn in employment, and the slow recovery in hiring that - followed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau - of Labor Statistics. +losses across the United States. The downturn in employment, and the slow recovery in hiring that +followed, was tracked each month by the Current Employment Statistics program at the U.S. Bureau +of Labor Statistics. - This file contains the monthly employment total in a variety of job categories from January 2006 - through December 2015. The numbers are seasonally adjusted and reported in thousands. The data - were downloaded on Nov. 11, 2018, and reformatted for use in this library. +This file contains the monthly employment total in a variety of job categories from January 2006 +through December 2015. The numbers are seasonally adjusted and reported in thousands. The data +were downloaded on Nov. 11, 2018, and reformatted for use in this library. - Totals are included for the [22 "supersectors"](https://download.bls.gov/pub/time.series/ce/ce.supersector) - tracked by the BLS. The "nonfarm" total is the category typically used by - economists and journalists as a stand-in for the country's employment total. +Totals are included for the [22 "supersectors"](https://download.bls.gov/pub/time.series/ce/ce.supersector) +tracked by the BLS. The "nonfarm" total is the category typically used by +economists and journalists as a stand-in for the country's employment total. - A calculated "nonfarm_change" column has been appended with the month-to-month change in that - supersector's employment. It is useful for illustrating how to make bar charts that report both - negative and positive values. +A calculated "nonfarm_change" column has been appended with the month-to-month change in that +supersector's employment. It is useful for illustrating how to make bar charts that report both +negative and positive values. 
### schema @@ -1326,8 +1326,8 @@ us-state-capitals.json volcano.json ### description Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field. - This data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a - topographic map by Ross Ihaka, adapted from R datasets. These data should not be regarded as accurate. +This data set gives topographic information for Maunga Whau on a 10m by 10m grid. Digitized from a +topographic map by Ross Ihaka, adapted from R datasets. These data should not be regarded as accurate. ### sources | title | path | |:-----------|:---------------------------------------------------------------------------| @@ -1337,7 +1337,7 @@ Maunga Whau (Mt Eden) is one of about 50 volcanos in the Auckland volcanic field weather.csv ### description NOAA data transformed using `/scripts/weather.py`. Categorical "weather" field synthesized - from multiple fields in the original dataset. This data is intended for instructional purposes. +from multiple fields in the original dataset. This data is intended for instructional purposes. ### schema | name | type | @@ -1363,10 +1363,10 @@ Instructional dataset showing actual and predicted temperature data. wheat.json ### description In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair), - a Scottish engineer who is often credited as the founder of statistical graphics, - published an elegant chart on the price of wheat. It plots 250 years of prices alongside - weekly wages and the reigning monarch. He intended to demonstrate that "never at any former period - was wheat so cheap, in proportion to mechanical labour, as it is at the present time." +a Scottish engineer who is often credited as the founder of statistical graphics, +published an elegant chart on the price of wheat. It plots 250 years of prices alongside +weekly wages and the reigning monarch. 
He intended to demonstrate that "never at any former period +was wheat so cheap, in proportion to mechanical labour, as it is at the present time." ### schema | name | type | diff --git a/datapackage.json b/datapackage.json index 8de71218..41d9b7da 100644 --- a/datapackage.json +++ b/datapackage.json @@ -20,7 +20,7 @@ } ], "version": "2.11.0", - "created": "2024-12-13T21:05:54.344553+00:00", + "created": "2024-12-13T21:17:21.482195+00:00", "resources": [ { "name": "7zip.png", From 8b09c39ef827c7b3e6e3aa6083f40f91f1ca2bc8 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 13 Dec 2024 21:19:53 +0000 Subject: [PATCH 33/40] fix: correct `burtin.json` quote breaks https://github.com/vega/vega-datasets/pull/643#discussion_r1884021300 --- _data/datapackage_additions.toml | 14 +++++++------- datapackage-tabular.md | 14 +++++++------- datapackage.json | 4 ++-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/_data/datapackage_additions.toml b/_data/datapackage_additions.toml index 56665c7a..a950d28e 100644 --- a/_data/datapackage_additions.toml +++ b/_data/datapackage_additions.toml @@ -96,23 +96,23 @@ reads as follows: > > > The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in \ -> red and gram- in blue) with their sensitivities to penicillin, and streptomycin. +red and gram- in blue) with their sensitivities to penicillin, and streptomycin. > > The effectiveness of the antibiotics is expressed as the highest dilution in μ/ml. which inhibits \ -> the test organism. +the test organism. > > High dilutions are toward the periphery; consequently the length of the colored bar is proportional \ -> to the effectiveness. +to the effectiveness. > > It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. \ -> fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. \ -> vulgaris, S. 
schottmuelleri and M. tuberculosis. +fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. \ +vulgaris, S. schottmuelleri and M. tuberculosis. > > Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to \ -> neomycin, although the majority of these are sensitive to neomycin. +neomycin, although the majority of these are sensitive to neomycin. > > It also inhibits actinomycetes, but is inactive against viruses and fungi. Its mode of action is \ -> not understood. +not understood. """ [[resources.sources]] diff --git a/datapackage-tabular.md b/datapackage-tabular.md index bf9bb72a..085ac57a 100644 --- a/datapackage-tabular.md +++ b/datapackage-tabular.md @@ -1,5 +1,5 @@ # vega-datasets -`2.11.0` | [GitHub](http://github.com/vega/vega-datasets.git) | 2024-12-13 21:17:21 [UTC] +`2.11.0` | [GitHub](http://github.com/vega/vega-datasets.git) | 2024-12-13 21:19:02 [UTC] Common repository for example datasets used by Vega related projects. @@ -224,17 +224,17 @@ reads as follows: > ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin > > -> The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in > red and gram- in blue) with their sensitivities to penicillin, and streptomycin. +> The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in red and gram- in blue) with their sensitivities to penicillin, and streptomycin. > -> The effectiveness of the antibiotics is expressed as the highest dilution in μ/ml. which inhibits > the test organism. +> The effectiveness of the antibiotics is expressed as the highest dilution in μ/ml. which inhibits the test organism. > -> High dilutions are toward the periphery; consequently the length of the colored bar is proportional > to the effectiveness. 
+> High dilutions are toward the periphery; consequently the length of the colored bar is proportional to the effectiveness. > -> It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. > fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. > vulgaris, S. schottmuelleri and M. tuberculosis. +> It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. vulgaris, S. schottmuelleri and M. tuberculosis. > -> Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to > neomycin, although the majority of these are sensitive to neomycin. +> Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to neomycin, although the majority of these are sensitive to neomycin. > -> It also inhibits actinomycetes, but is inactive against viruses and fungi. Its mode of action is > not understood. +> It also inhibits actinomycetes, but is inactive against viruses and fungi. Its mode of action is not understood. 
### schema diff --git a/datapackage.json b/datapackage.json index 41d9b7da..e821a18d 100644 --- a/datapackage.json +++ b/datapackage.json @@ -20,7 +20,7 @@ } ], "version": "2.11.0", - "created": "2024-12-13T21:17:21.482195+00:00", + "created": "2024-12-13T21:19:02.336933+00:00", "resources": [ { "name": "7zip.png", @@ -587,7 +587,7 @@ { "name": "burtin.json", "type": "table", - "description": "The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine.\n\nThe dataset compares the performance of three antibiotics against 16 different bacteria.\n\nNumerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness.\n\nThe dataset was featured as an example in the Protovis project, a precursor to D3.js.\n\nAs noted in the Protovis example, \"Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 μg/ml, and an exaggeration of some values for penicillin\".\n\nThe vega-datsets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together.\n\nThe caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) \nreads as follows:\n\n> ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin\n>\n>\n> The chart compares the in vitro sensitivities to neomycin of some of the common pathogens (gram+ in > red and gram- in blue) with their sensitivities to penicillin, and streptomycin.\n>\n> The effectiveness of the antibiotics is expressed as the highest dilution in μ/ml. 
which inhibits > the test organism.\n>\n> High dilutions are toward the periphery; consequently the length of the colored bar is proportional > to the effectiveness.\n>\n> It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. > fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. > vulgaris, S. schottmuelleri and M. tuberculosis.\n>\n> Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to > neomycin, although the majority of these are sensitive to neomycin.\n>\n> It also inhibits actinomycetes, but is inactive against viruses and fungi. Its mode of action is > not understood.\n", + "description": "The burtin.json dataset is based on graphic designer Will Burtin's 1951 visualization of antibiotic effectiveness, originally published in Scope Magazine.\n\nThe dataset compares the performance of three antibiotics against 16 different bacteria.\n\nNumerical values in the dataset represent the minimum inhibitory concentration (MIC) of each antibiotic, measured in units per milliliter, with lower values indicating higher antibiotic effectiveness.\n\nThe dataset was featured as an example in the Protovis project, a precursor to D3.js.\n\nAs noted in the Protovis example, \"Recreating this display revealed some minor errors in the original: a missing grid line at 0.01 μg/ml, and an exaggeration of some values for penicillin\".\n\nThe vega-datsets version is largely consistent with the Protovis version of the dataset, with one correction (changing 'Brucella antracis' to the correct 'Bacillus anthracis') and the addition of a new column, 'Genus', to group related bacterial species together.\n\nThe caption of the original 1951 [visualization](https://graphicdesignarchives.org/wp-content/uploads/wmgda_8616c.jpg) \nreads as follows:\n\n> ## Antibacterial ranges of Neomycin, Penicillin and Streptomycin\n>\n>\n> The chart compares the in vitro sensitivities to 
neomycin of some of the common pathogens (gram+ in red and gram- in blue) with their sensitivities to penicillin, and streptomycin.\n>\n> The effectiveness of the antibiotics is expressed as the highest dilution in μ/ml. which inhibits the test organism.\n>\n> High dilutions are toward the periphery; consequently the length of the colored bar is proportional to the effectiveness.\n>\n> It is apparent that neomycin is especially effective against Staph. albus and aureus, Streph. fecalis, A. aerogenes, S. typhosa, E. coli, Ps. aeruginosa, Br. abortus, K. pneumoniae, Pr. vulgaris, S. schottmuelleri and M. tuberculosis.\n>\n> Unfortunately, some strains of proteus, pseudomonas and hemolytic streptococcus are resistant to neomycin, although the majority of these are sensitive to neomycin.\n>\n> It also inhibits actinomycetes, but is inactive against viruses and fungi. Its mode of action is not understood.\n", "sources": [ { "title": "Scope Magazine", From 9c9eefa884d9f11bc144faf2a14f6c7c52158fc2 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 13 Dec 2024 21:31:26 +0000 Subject: [PATCH 34/40] fix: more markdown fixes --- _data/datapackage_additions.toml | 11 ++++++----- datapackage-tabular.md | 12 ++++++------ datapackage.json | 8 ++++---- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/_data/datapackage_additions.toml b/_data/datapackage_additions.toml index a950d28e..ca53c39b 100644 --- a/_data/datapackage_additions.toml +++ b/_data/datapackage_additions.toml @@ -618,8 +618,8 @@ path = "obesity.json" [[resources]] # Path: ohlc.json path = "ohlc.json" description = """This dataset contains the performance of the Chicago Board Options Exchange -[Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/ -%5EVIX#overview)) in the summer of 2009.""" +[Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/%5EVIX#overview)) +in the 
summer of 2009.""" [[resources.sources]] title = "Yahoo Finance VIX Data" @@ -720,7 +720,7 @@ path = "https://usa.ipums.org/usa/" [[resources]] # Path: population_engineers_hurricanes.csv path = "population_engineers_hurricanes.csv" description = """Per-state data on population, number of engineers, and hurricanes. Used in Vega-Lite example, -[Three Choropleths Representing Disjoint DAta from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html)""" +[Three Choropleths Representing Disjoint Data from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html)""" [[resources.sources]] title = "Bureau of Labor Statistics" @@ -934,8 +934,9 @@ path = "wheat.json" description = """In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair), a Scottish engineer who is often credited as the founder of statistical graphics, published an elegant chart on the price of wheat. It plots 250 years of prices alongside -weekly wages and the reigning monarch. He intended to demonstrate that "never at any former period -was wheat so cheap, in proportion to mechanical labour, as it is at the present time."""" +weekly wages and the reigning monarch. He intended to demonstrate that: +> "never at any former period was wheat so cheap, in proportion to mechanical labour, \ +as it is at the present time."""" [[resources.sources]] title = "1822 Playfair Chart" diff --git a/datapackage-tabular.md b/datapackage-tabular.md index 085ac57a..1e042cc8 100644 --- a/datapackage-tabular.md +++ b/datapackage-tabular.md @@ -1,5 +1,5 @@ # vega-datasets -`2.11.0` | [GitHub](http://github.com/vega/vega-datasets.git) | 2024-12-13 21:19:02 [UTC] +`2.11.0` | [GitHub](http://github.com/vega/vega-datasets.git) | 2024-12-13 21:28:31 [UTC] Common repository for example datasets used by Vega related projects. 
@@ -899,8 +899,8 @@ obesity.json ohlc.json ### description This dataset contains the performance of the Chicago Board Options Exchange -[Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/ -%5EVIX#overview)) in the summer of 2009. +[Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/%5EVIX#overview)) +in the summer of 2009. ### schema | name | type | @@ -1061,7 +1061,7 @@ http://doi.org/10.18128/D010.V6.0 population_engineers_hurricanes.csv ### description Per-state data on population, number of engineers, and hurricanes. Used in Vega-Lite example, -[Three Choropleths Representing Disjoint DAta from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html) +[Three Choropleths Representing Disjoint Data from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html) ### schema | name | type | @@ -1365,8 +1365,8 @@ wheat.json In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair), a Scottish engineer who is often credited as the founder of statistical graphics, published an elegant chart on the price of wheat. It plots 250 years of prices alongside -weekly wages and the reigning monarch. He intended to demonstrate that "never at any former period -was wheat so cheap, in proportion to mechanical labour, as it is at the present time." +weekly wages and the reigning monarch. He intended to demonstrate that: +> "never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time." 
### schema | name | type | diff --git a/datapackage.json b/datapackage.json index e821a18d..55cebf47 100644 --- a/datapackage.json +++ b/datapackage.json @@ -20,7 +20,7 @@ } ], "version": "2.11.0", - "created": "2024-12-13T21:19:02.336933+00:00", + "created": "2024-12-13T21:28:31.574330+00:00", "resources": [ { "name": "7zip.png", @@ -2052,7 +2052,7 @@ { "name": "ohlc.json", "type": "table", - "description": "This dataset contains the performance of the Chicago Board Options Exchange \n[Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/\n%5EVIX#overview)) in the summer of 2009.", + "description": "This dataset contains the performance of the Chicago Board Options Exchange \n[Volatility Index](https://en.wikipedia.org/wiki/VIX) ([VIX](https://finance.yahoo.com/chart/%5EVIX#overview))\nin the summer of 2009.", "sources": [ { "title": "Yahoo Finance VIX Data", @@ -2419,7 +2419,7 @@ { "name": "population_engineers_hurricanes.csv", "type": "table", - "description": "Per-state data on population, number of engineers, and hurricanes. Used in Vega-Lite example,\n[Three Choropleths Representing Disjoint DAta from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html)", + "description": "Per-state data on population, number of engineers, and hurricanes. Used in Vega-Lite example,\n[Three Choropleths Representing Disjoint Data from the Same Table](https://vega.github.io/vega-lite/examples/geo_repeat.html)", "sources": [ { "title": "Bureau of Labor Statistics", @@ -3028,7 +3028,7 @@ { "name": "wheat.json", "type": "table", - "description": "In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair),\na Scottish engineer who is often credited as the founder of statistical graphics, \npublished an elegant chart on the price of wheat. It plots 250 years of prices alongside \nweekly wages and the reigning monarch. 
He intended to demonstrate that \"never at any former period \nwas wheat so cheap, in proportion to mechanical labour, as it is at the present time.\"", + "description": "In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair),\na Scottish engineer who is often credited as the founder of statistical graphics, \npublished an elegant chart on the price of wheat. It plots 250 years of prices alongside \nweekly wages and the reigning monarch. He intended to demonstrate that:\n> \"never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time.\"", "sources": [ { "title": "1822 Playfair Chart", From 6998e0736faaf2dcb59b5625453c4fe3dd173f98 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 13 Dec 2024 21:35:23 +0000 Subject: [PATCH 35/40] fix: move triple quotes to own line Seems to be parsed fine, but confused the language server in vscode --- _data/datapackage_additions.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/_data/datapackage_additions.toml b/_data/datapackage_additions.toml index ca53c39b..5dc20b9d 100644 --- a/_data/datapackage_additions.toml +++ b/_data/datapackage_additions.toml @@ -936,7 +936,8 @@ a Scottish engineer who is often credited as the founder of statistical graphics published an elegant chart on the price of wheat. It plots 250 years of prices alongside weekly wages and the reigning monarch. He intended to demonstrate that: > "never at any former period was wheat so cheap, in proportion to mechanical labour, \ -as it is at the present time."""" +as it is at the present time." 
+""" [[resources.sources]] title = "1822 Playfair Chart" From c97ad7d156f35cfcc060fd00700012a5dd3cdf60 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 14 Dec 2024 15:05:55 +0000 Subject: [PATCH 36/40] refactor: rename `datapackage-tabular.md` -> `datapackage.md` https://github.com/vega/vega-datasets/pull/643#discussion_r1884612687 --- datapackage.json | 4 ++-- datapackage-tabular.md => datapackage.md | 3 ++- scripts/build_datapackage.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) rename datapackage-tabular.md => datapackage.md (99%) diff --git a/datapackage.json b/datapackage.json index 55cebf47..4fa26267 100644 --- a/datapackage.json +++ b/datapackage.json @@ -20,7 +20,7 @@ } ], "version": "2.11.0", - "created": "2024-12-13T21:28:31.574330+00:00", + "created": "2024-12-14T15:02:06.096232+00:00", "resources": [ { "name": "7zip.png", @@ -3028,7 +3028,7 @@ { "name": "wheat.json", "type": "table", - "description": "In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair),\na Scottish engineer who is often credited as the founder of statistical graphics, \npublished an elegant chart on the price of wheat. It plots 250 years of prices alongside \nweekly wages and the reigning monarch. He intended to demonstrate that:\n> \"never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time.\"", + "description": "In an 1822 letter to Parliament, [William Playfair](https://en.wikipedia.org/wiki/William_Playfair),\na Scottish engineer who is often credited as the founder of statistical graphics, \npublished an elegant chart on the price of wheat. It plots 250 years of prices alongside \nweekly wages and the reigning monarch. 
He intended to demonstrate that:\n> \"never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time.\"\n", "sources": [ { "title": "1822 Playfair Chart", diff --git a/datapackage-tabular.md b/datapackage.md similarity index 99% rename from datapackage-tabular.md rename to datapackage.md index 1e042cc8..b781bb07 100644 --- a/datapackage-tabular.md +++ b/datapackage.md @@ -1,5 +1,5 @@ # vega-datasets -`2.11.0` | [GitHub](http://github.com/vega/vega-datasets.git) | 2024-12-13 21:28:31 [UTC] +`2.11.0` | [GitHub](http://github.com/vega/vega-datasets.git) | 2024-12-14 15:02:06 [UTC] Common repository for example datasets used by Vega related projects. @@ -1367,6 +1367,7 @@ a Scottish engineer who is often credited as the founder of statistical graphics published an elegant chart on the price of wheat. It plots 250 years of prices alongside weekly wages and the reigning monarch. He intended to demonstrate that: > "never at any former period was wheat so cheap, in proportion to mechanical labour, as it is at the present time." 
+ ### schema | name | type | diff --git a/scripts/build_datapackage.py b/scripts/build_datapackage.py index 60661f62..b4163eaf 100755 --- a/scripts/build_datapackage.py +++ b/scripts/build_datapackage.py @@ -526,7 +526,7 @@ def write_package(pkg: Package, repo_dir: Path, *formats: OutputFormat) -> None: configs: dict[OutputFormat, tuple[str, PackageMethod[str]]] = { "json": (".json", partial(Package.to_json)), "yaml": (".yaml", partial(Package.to_yaml)), - "md": ("-tabular.md", partial(Package.to_markdown, table=True)), + "md": (".md", partial(Package.to_markdown, table=True)), } for fmt in formats: postfix, fn = configs[fmt] From d550d8a862165854240b6b7777feae0f9da65937 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 14 Dec 2024 15:13:27 +0000 Subject: [PATCH 37/40] docs: Update `README.md` links Pre-merging, you can use these instead: https://github.com/dsmedia/vega-datasets/blob/main/datapackage.md#resources https://github.com/dsmedia/vega-datasets/blob/main/datapackage.json --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b450ca9e..56dfc62f 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,9 @@ Collection of datasets used in Vega and Vega-Lite examples. This data lives at h Common repository for example datasets used by Vega related projects. Keep changes to this repository minimal as other projects (Vega, Vega Editor, Vega-Lite, Polestar, Voyager) use this data in their tests and for examples. -The list of sources is in [SOURCES.md](https://github.com/vega/vega-datasets/blob/next/SOURCES.md). +The list of sources is in [datapackage.md](https://github.com/vega/vega-datasets/blob/next/datapackage.md#resources). +This metadata is also available in a machine-readable format at [datapackage.json](https://github.com/vega/vega-datasets/blob/next/datapackage.json). + To access the data in Observable, you can import `vega-dataset`. 
Try our [example notebook](https://observablehq.com/@vega/vega-datasets). To access these datasets from Python, you can use the [Vega datasets python package](https://github.com/altair-viz/vega_datasets). To access them from Julia, you can use the [VegaDatasets.jl julia package](https://github.com/davidanthoff/VegaDatasets.jl). From 5ce07d5fc026d638eebff10b2ab68e19b51f548c Mon Sep 17 00:00:00 2001 From: Daniel Sorid <63077097+dsmedia@users.noreply.github.com> Date: Sat, 14 Dec 2024 16:26:28 +0000 Subject: [PATCH 38/40] docs: update description with dataset license information Top-level description included in datapackage_additions.toml overrides the value pulled in from package.json. --- _data/datapackage_additions.toml | 7 +++++++ datapackage.json | 4 ++-- datapackage.md | 8 ++++++-- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/_data/datapackage_additions.toml b/_data/datapackage_additions.toml index 5dc20b9d..f5d8a381 100644 --- a/_data/datapackage_additions.toml +++ b/_data/datapackage_additions.toml @@ -1,3 +1,10 @@ +description = """ +Common repository for example datasets used by Vega related projects. +BSD-3-Clause license applies only to package code and infrastructure. Users should verify their use of datasets +complies with the license terms of the original sources. Dataset license information, where included, +is a reference starting point only and is provided without any warranty of accuracy or completeness. +""" + [[licenses]] name = "BSD-3-Clause" path = "https://opensource.org/license/bsd-3-clause" diff --git a/datapackage.json b/datapackage.json index 4fa26267..0b47f4b8 100644 --- a/datapackage.json +++ b/datapackage.json @@ -1,6 +1,6 @@ { "name": "vega-datasets", - "description": "Common repository for example datasets used by Vega related projects.", + "description": "Common repository for example datasets used by Vega related projects. \nBSD-3-Clause license applies only to package code and infrastructure. 
Users should verify their use of datasets \ncomplies with the license terms of the original sources. Dataset license information, where included, \nis a reference starting point only and is provided without any warranty of accuracy or completeness.\n", "homepage": "http://github.com/vega/vega-datasets.git", "licenses": [ { @@ -20,7 +20,7 @@ } ], "version": "2.11.0", - "created": "2024-12-14T15:02:06.096232+00:00", + "created": "2024-12-14T16:23:18.638262+00:00", "resources": [ { "name": "7zip.png", diff --git a/datapackage.md b/datapackage.md index b781bb07..12532a44 100644 --- a/datapackage.md +++ b/datapackage.md @@ -1,7 +1,11 @@ # vega-datasets -`2.11.0` | [GitHub](http://github.com/vega/vega-datasets.git) | 2024-12-14 15:02:06 [UTC] +`2.11.0` | [GitHub](http://github.com/vega/vega-datasets.git) | 2024-12-14 16:23:18 [UTC] + +Common repository for example datasets used by Vega related projects. +BSD-3-Clause license applies only to package code and infrastructure. Users should verify their use of datasets +complies with the license terms of the original sources. Dataset license information, where included, +is a reference starting point only and is provided without any warranty of accuracy or completeness. -Common repository for example datasets used by Vega related projects. 
## licenses | name | path | title | From 257bcb719dbdec0cf9c466ab1d88d27c7b33f84c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 14 Dec 2024 19:38:03 +0000 Subject: [PATCH 39/40] fix: remove overridden `description` Warning was emitted following (https://github.com/vega/vega-datasets/pull/643/commits/5ce07d5fc026d638eebff10b2ab68e19b51f548c) --- datapackage.json | 2 +- datapackage.md | 2 +- scripts/build_datapackage.py | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/datapackage.json b/datapackage.json index 0b47f4b8..1fb947eb 100644 --- a/datapackage.json +++ b/datapackage.json @@ -20,7 +20,7 @@ } ], "version": "2.11.0", - "created": "2024-12-14T16:23:18.638262+00:00", + "created": "2024-12-14T19:35:54.935236+00:00", "resources": [ { "name": "7zip.png", diff --git a/datapackage.md b/datapackage.md index 12532a44..827b7452 100644 --- a/datapackage.md +++ b/datapackage.md @@ -1,5 +1,5 @@ # vega-datasets -`2.11.0` | [GitHub](http://github.com/vega/vega-datasets.git) | 2024-12-14 16:23:18 [UTC] +`2.11.0` | [GitHub](http://github.com/vega/vega-datasets.git) | 2024-12-14 19:35:54 [UTC] Common repository for example datasets used by Vega related projects. BSD-3-Clause license applies only to package code and infrastructure. 
Users should verify their use of datasets diff --git a/scripts/build_datapackage.py b/scripts/build_datapackage.py index b4163eaf..96b74bfc 100755 --- a/scripts/build_datapackage.py +++ b/scripts/build_datapackage.py @@ -364,7 +364,7 @@ class PackageMeta(TypedDict): name: str version: str homepage: str - description: str + description: NotRequired[str] licenses: NotRequired[Sequence[License]] contributors: Sequence[Contributor] sources: NotRequired[Sequence[Source]] @@ -391,7 +391,6 @@ def _extract_npm_metadata(m: Mapping[str, Any], /) -> PackageMeta: name=m["name"], version=m["version"], homepage=m["repository"]["url"], - description=m["description"], contributors=[Contributor(title=m["author"]["name"], path=m["author"]["url"])], created=dt.datetime.now(dt.UTC).isoformat(), ) From 755976fcd66aaa918acdd35d00dac92ba054669b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 15 Dec 2024 18:21:29 +0000 Subject: [PATCH 40/40] ci: remove `build_datapackage.py` exclude --- pyproject.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b1224e53..4a803719 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,8 +25,6 @@ extend-exclude = [ "scripts/update_countries_json.py", "scripts/update_gapminder.py", "scripts/weather.py", - # TEMPORARY - "scripts/build_datapackage.py", ] format = { docstring-code-format = true, preview = true } target-version = "py312"