Queries to the Data Lake#
A set of Spark-flavoured SQL queries that gather relevant data from the Wikimedia Foundation’s Analytics Data Lake.
- image_suggestions.queries.wiki_sizes = "SELECT wiki_db, COUNT(*) AS size\nFROM wmf_raw.mediawiki_page\nWHERE snapshot='{}'\nAND page_namespace=0\nAND page_is_redirect=0\nGROUP BY wiki_db\n"#
Compute the amount of article pages per wiki, redirects excluded.
- image_suggestions.queries.wikidata_items_with_P18 = 'SELECT id AS item_id,\nreplace(regexp_extract(claim.mainSnak.datavalue.value, \'^"(.*)"$\', 1), \' \', \'_\') AS value\nFROM wmf.wikidata_entity\nLATERAL VIEW OUTER explode(claims) AS claim\nWHERE snapshot=\'{}\'\nAND typ=\'item\'\nAND claim.mainSnak.property=\'P18\'\n'#
-
regexp_extract removes wrapping quotes from string values and replaces spaces with underscores to match image file names in page_title’s format.
- image_suggestions.queries.wikidata_items_with_P373 = 'SELECT id AS item_id,\nreplace(regexp_extract(claim.mainSnak.datavalue.value, \'^"(.*)"$\', 1), \' \', \'_\') AS value\nFROM wmf.wikidata_entity\nLATERAL VIEW OUTER explode(claims) AS claim\nWHERE snapshot=\'{}\'\nAND typ=\'item\'\nAND claim.mainSnak.property=\'P373\'\n'#
Gather Commons category Wikidata claims.
regexp_extract removes wrapping quotes from string values and replaces spaces with underscores to match category names elsewhere.
- image_suggestions.queries.wikidata_items_with_P31 = "SELECT id AS item_id,\nfrom_json(claim.mainSnak.dataValue.value, 'entityType STRING, numericId INT, id STRING').id AS value\nFROM wmf.wikidata_entity\nLATERAL VIEW OUTER explode(claims) AS claim\nWHERE snapshot='{}'\nAND typ='item'\nAND claim.mainSnak.property='P31'\n"#
Gather instance of Wikidata claims.
from_json extracts Wikidata QIDs, which are stored as JSON strings in claims.
- image_suggestions.queries.commons_pages_with_depicts = "SELECT DISTINCT\nfrom_json(statement.mainsnak.datavalue.value, 'entityType STRING, numericId INT, id STRING').id AS item_id,\nSUBSTRING(id, 2) AS page_id,\nstatement.mainsnak.property AS property_id\nFROM structured_data.commons_entity\nLATERAL VIEW OUTER explode(statements) AS statement\nWHERE snapshot='{}'\nAND statement.mainsnak.property IN ('P180', 'P6243', 'P921')\n"#
Gather Commons depicts statements.
depicts, main subject, and is digital representation of Wikidata properties are all used to represent similar information.
- image_suggestions.queries.commons_file_pages = "SELECT page_id, page_title\nFROM wmf_raw.mediawiki_page\nWHERE snapshot='{}'\nAND wiki_db='commonswiki'\nAND page_namespace=6\nAND page_is_redirect=0\n"#
Gather Commons file page IDs and titles.
- image_suggestions.queries.local_images = "SELECT wiki_db, page_id, page_title\nFROM wmf_raw.mediawiki_page\nWHERE snapshot='{}'\nAND wiki_db!='commonswiki'\nAND page_namespace=6\nAND page_is_redirect=0\n"#
Gather file page IDs and titles locally stored in wikis.
- image_suggestions.queries.category_links = "SELECT cl_from AS page_id, cl_to AS cat_title\nFROM wmf_raw.mediawiki_categorylinks\nWHERE snapshot='{}'\nAND wiki_db='commonswiki'\nAND cl_type='file'\n"#
Gather Commons categories linked to file page IDs.
- image_suggestions.queries.categories = "SELECT cat_title, cat_pages\nFROM wmf_raw.mediawiki_category\nWHERE snapshot='{}'\nAND wiki_db='commonswiki'\nAND cat_pages<100000\nAND cat_pages>0\n"#
Gather Commons categories used by less than 100 k pages.
- image_suggestions.queries.non_commons_main_pages = "SELECT wiki_db, page_id, page_title\nFROM wmf_raw.mediawiki_page\nWHERE snapshot='{}'\nAND wiki_db!='commonswiki'\nAND page_namespace=0\nAND page_is_redirect=0\n"#
Gather article pages of all wikis but Commons.
- image_suggestions.queries.pagelinks = "SELECT pl.wiki_db, lt_title AS to_title, pl_from AS from_id\nFROM wmf_raw.mediawiki_pagelinks pl\nINNER JOIN wmf_raw.mediawiki_private_linktarget lt\nON pl.snapshot=lt.snapshot\nAND pl.wiki_db=lt.wiki_db\nAND pl.pl_target_id=lt.lt_id\nWHERE pl.snapshot='{}'\n"#
Gather all page links.
- image_suggestions.queries.pages_with_lead_images = "SELECT wiki_db, pp_page AS page_id, pp_value AS lead_image_title\nFROM wmf_raw.mediawiki_page_props\nWHERE snapshot='{}'\nAND wiki_db!='commonswiki'\nAND pp_propname='page_image_free'\n"#
Gather page IDs with lead image file names from all wikis but Commons.
- image_suggestions.queries.wikidata_item_page_links = "SELECT item_id, wiki_db, page_id\nFROM wmf.wikidata_item_page_link\nWHERE snapshot='{}'\nAND page_namespace=0\n"#
Gather page IDs linked to Wikidata QIDs.
- image_suggestions.queries.imagelinks = "SELECT wiki_db, il_from AS article_id, il_to AS image_title\nFROM wmf_raw.mediawiki_imagelinks\nWHERE snapshot='{}'\nAND wiki_db!='commonswiki'\nAND il_from_namespace=0\n"#
Gather image file names linked to article pages of all wikis but Commons.
- image_suggestions.queries.latest_revisions = "SELECT wiki_db, rev_page AS page_id, MAX(rev_id) AS rev_id\nFROM wmf_raw.mediawiki_revision\nWHERE snapshot='{}'\nGROUP BY wiki_db, rev_page\n"#
Gather page IDs with their latest revisions.
- image_suggestions.queries.suggestions_with_feedback = "SELECT wiki, page_id, filename\nFROM event_sanitized.mediawiki_image_suggestions_feedback\nWHERE datacenter!=''\nAND year>=2022 AND month>0 AND day>0 AND hour<24\nAND (is_rejected=True OR is_accepted=True)\n"#
Gather image suggestions’ user feedback.
- image_suggestions.queries.cirrus_index_tags = "SELECT wiki, namespace, page_id, weighted_tags\nFROM discovery.cirrus_index_without_content\nWHERE cirrus_replica='codfw'\nAND snapshot='{}'\n"#
Gather Cirrus search index weighted tags available in production. Used as a previous state to compute the search index delta. The expected snapshot is YYYYMMDD.