Queries to the Data Lake#

A set of Spark-flavoured SQL queries that gather relevant data from the Wikimedia Foundation’s Analytics Data Lake.

image_suggestions.queries.wiki_sizes = "SELECT wiki_db, COUNT(*) AS size\nFROM wmf_raw.mediawiki_page\nWHERE snapshot='{}'\nAND page_namespace=0\nAND page_is_redirect=0\nGROUP BY wiki_db\n"#

Compute the amount of article pages per wiki, redirects excluded.

image_suggestions.queries.wikidata_items_with_P18 = 'SELECT id AS item_id,\nreplace(regexp_extract(claim.mainSnak.datavalue.value, \'^"(.*)"$\', 1), \' \', \'_\') AS value\nFROM wmf.wikidata_entity\nLATERAL VIEW OUTER explode(claims) AS claim\nWHERE snapshot=\'{}\'\nAND typ=\'item\'\nAND claim.mainSnak.property=\'P18\'\n'#

Gather image Wikidata claims.

regexp_extract removes wrapping quotes from string values and replaces spaces with underscores to match image file names in page_title’s format.

image_suggestions.queries.wikidata_items_with_P373 = 'SELECT id AS item_id,\nreplace(regexp_extract(claim.mainSnak.datavalue.value, \'^"(.*)"$\', 1), \' \', \'_\') AS value\nFROM wmf.wikidata_entity\nLATERAL VIEW OUTER explode(claims) AS claim\nWHERE snapshot=\'{}\'\nAND typ=\'item\'\nAND claim.mainSnak.property=\'P373\'\n'#

Gather Commons category Wikidata claims.

regexp_extract removes wrapping quotes from string values and replaces spaces with underscores to match category names elsewhere.

image_suggestions.queries.wikidata_items_with_P31 = "SELECT id AS item_id,\nfrom_json(claim.mainSnak.dataValue.value, 'entityType STRING, numericId INT, id STRING').id AS value\nFROM wmf.wikidata_entity\nLATERAL VIEW OUTER explode(claims) AS claim\nWHERE snapshot='{}'\nAND typ='item'\nAND claim.mainSnak.property='P31'\n"#

Gather instance of Wikidata claims.

from_json extracts Wikidata QIDs, which are stored as JSON strings in claims.

image_suggestions.queries.commons_pages_with_depicts = "SELECT DISTINCT\nfrom_json(statement.mainsnak.datavalue.value, 'entityType STRING, numericId INT, id STRING').id AS item_id,\nSUBSTRING(id, 2) AS page_id,\nstatement.mainsnak.property AS property_id\nFROM structured_data.commons_entity\nLATERAL VIEW OUTER explode(statements) AS statement\nWHERE snapshot='{}'\nAND statement.mainsnak.property IN ('P180', 'P6243', 'P921')\n"#

Gather Commons depicts statements.

depicts, main subject, and is digital representation of Wikidata properties are all used to represent similar information.

image_suggestions.queries.commons_file_pages = "SELECT page_id, page_title\nFROM wmf_raw.mediawiki_page\nWHERE snapshot='{}'\nAND wiki_db='commonswiki'\nAND page_namespace=6\nAND page_is_redirect=0\n"#

Gather Commons file page IDs and titles.

image_suggestions.queries.local_images = "SELECT wiki_db, page_id, page_title\nFROM wmf_raw.mediawiki_page\nWHERE snapshot='{}'\nAND wiki_db!='commonswiki'\nAND page_namespace=6\nAND page_is_redirect=0\n"#

Gather file page IDs and titles locally stored in wikis.

Gather Commons categories linked to file page IDs.

image_suggestions.queries.categories = "SELECT cat_title, cat_pages\nFROM wmf_raw.mediawiki_category\nWHERE snapshot='{}'\nAND wiki_db='commonswiki'\nAND cat_pages<100000\nAND cat_pages>0\n"#

Gather Commons categories used by less than 100 k pages.

image_suggestions.queries.non_commons_main_pages = "SELECT wiki_db, page_id, page_title\nFROM wmf_raw.mediawiki_page\nWHERE snapshot='{}'\nAND wiki_db!='commonswiki'\nAND page_namespace=0\nAND page_is_redirect=0\n"#

Gather article pages of all wikis but Commons.

Gather all page links.

image_suggestions.queries.pages_with_lead_images = "SELECT wiki_db, pp_page AS page_id, pp_value AS lead_image_title\nFROM wmf_raw.mediawiki_page_props\nWHERE snapshot='{}'\nAND wiki_db!='commonswiki'\nAND pp_propname='page_image_free'\n"#

Gather page IDs with lead image file names from all wikis but Commons.

Gather page IDs linked to Wikidata QIDs.

Gather image file names linked to article pages of all wikis but Commons.

image_suggestions.queries.latest_revisions = "SELECT wiki_db, rev_page AS page_id, MAX(rev_id) AS rev_id\nFROM wmf_raw.mediawiki_revision\nWHERE snapshot='{}'\nGROUP BY wiki_db, rev_page\n"#

Gather page IDs with their latest revisions.

image_suggestions.queries.suggestions_with_feedback = "SELECT wiki, page_id, filename\nFROM event_sanitized.mediawiki_image_suggestions_feedback\nWHERE datacenter!=''\nAND year>=2022 AND month>0 AND day>0 AND hour<24\nAND (is_rejected=True OR is_accepted=True)\n"#

Gather image suggestions’ user feedback.

image_suggestions.queries.cirrus_index_tags = "SELECT wiki, namespace, page_id, weighted_tags\nFROM discovery.cirrus_index_without_content\nWHERE cirrus_replica='codfw'\nAND snapshot='{}'\n"#

Gather Cirrus search index weighted tags available in production. Used as a previous state to compute the search index delta. The expected snapshot is YYYYMMDD.