## ######### ##
## Description : Script en langage R, exécutable sur Linux (pour conserver les caractères non-ASCII), permettant d'extraire automatiquement les données affichées sur les pages de fin de Mush et/ou sur les pages de profils, et ainsi de lister la quasi-totalité des joueurs et parties jouées sur les trois versions de ce jeu. Ce script génère des bases de données au format CSV regroupant pour chaque vaisseau des informations sur la partie, les personnages et les joueurs, ainsi que plusieurs types de statistiques, et pour chaque joueur l'ensemble des trophées qu'il a obtenus, son réseau de groupe et d'amis, son avatar.
## Author : Edward Cage; https://www.edwardcage.pro/
## Version : 2
## Date : 2021-07-27
## Licence : CC BY-NC-SA 3.0; https://creativecommons.org/licenses/by-nc-sa/3.0/
## R 4.1.0
## httr 1.4.2 ; stringi 1.6.2 ; stringr 1.4.0 ; utf8 1.2.1 ; XML 3.99-0.66 ; xml2 1.3.2
## ######### ##
message('Démarrage du script')
# ################## RÉGLAGES ################## #
# Langues
languages <- c("en", "es", "fr")
# Début et fin du webscrapping vaisseaux
ship_start <- 0
ship_end <- 75000
# Récupération des données joueurs (et avatars)
get_users <- TRUE
get_avatars <- TRUE
# Pause minimum entre chaque téléchargement
wait <- 1
# Personnages en doublon
char_dupl_max <- 3
# Préfixes noms des fichiers CSV & images
file_name <- "histo"
csv_ship_name <- "vaisseaux"
csv_list_name <- "liste"
csv_user_name <- "joueurs"
img_dir_name <- "avatars"
# Nombre de lignes par fichier CSV
csv_size <- 1000
# Compression modalités vaisseaux
compress <- TRUE
# ############################################## #
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
# ========= PACKAGES NÉCESSAIRES & FONCTIONS =========
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
library(httr)
library(stringi)
library(stringr)
library(utf8)
library(XML)
library(xml2)
download <- function(url){
r <- NULL
w <- wait
while(is.null(r) || is.na(r) || is.integer(r) || toString(r) == "" || !utf8_valid(r)){
suppressMessages(try(r <- rawToChar(GET(url)$content)))
if(!is.null(r)){if(is.character(r)){
if(!utf8_valid(r)){r <- str_replace_all(r, "\uFFFD", "")}
}}
if(w > wait){message(paste0('[', format(Sys.time(), "%X"), '] Échec téléchargement, nouvelle tentative dans ', w, 's.'))}
Sys.sleep(w)
w <- w * 2
}
return(r)
}
html2txt <- function(code){
code <- gsub('&', '&', code)
code <- gsub('>', '>', code)
code <- gsub('<', '<', code)
code <- gsub('"', '"', code)
return(code)
}
twino2iso <- function(date, lang){
month_name_en <- c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December")
month_name_es <- c("enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre")
month_name_fr <- c("janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août", "septembre", "octobre", "novembre", "décembre")
day_name_en <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
day_name_es <- c("Lunes", "Martes", "Miércoles", "Jueves", "Viernes", "Sábado", "Domingo")
day_name_fr <- c("Lundi", "Mardi", "Mercredi", "Jeudi", "Vendredi", "Samedi", "Dimanche")
day_name_sys <- c("lundi", "mardi", "mercredi", "jeudi", "vendredi", "samedi", "dimanche")
assign("month_name", get(paste0("month_name_", lang)))
assign("day_name", get(paste0("day_name_", lang)))
if(
stri_detect(date, regex = '^(Il y a |Hace )?[0-9]{1,2} min( ago)?$') ||
stri_detect(date, regex = "^Aujourd'hui à [0-9]{2}:[0-9]{2}$") ||
stri_detect(date, regex = '^Today at [0-9]{2}:[0-9]{2}$') ||
stri_detect(date, regex = '^Hoy a las [0-9]{2}:[0-9]{2}$')
){
date <- Sys.Date()
}else if(
stri_detect(date, regex = "^Hier à [0-9]{2}:[0-9]{2}$") ||
stri_detect(date, regex = '^Yesterday at [0-9]{2}:[0-9]{2}$') ||
stri_detect(date, regex = '^Ayer a las [0-9]{2}:[0-9]{2}$')
){
date <- Sys.Date() - 1
}else if(
stri_detect(date, regex = "^[A-Za-z]* à [0-9]{2}:[0-9]{2}$") ||
stri_detect(date, regex = '^[A-Za-z]* at [0-9]{2}:[0-9]{2}$') ||
stri_detect(date, regex = '^[A-Za-zéá]* a las [0-9]{2}:[0-9]{2}$')
){
d <- gsub('([A-Za-zéá]*) (at|a las|à) [0-9]{2}:[0-9]{2}', '\\1', date)
if(match(weekdays(Sys.Date()), day_name_sys) > match(d, day_name)){
date <- Sys.Date() - (match(weekdays(Sys.Date()), day_name_sys) - match(d, day_name))
}else{
date <- Sys.Date() - (7 + (match(weekdays(Sys.Date()), day_name_sys) - match(d, day_name)))
}
}else if(stri_detect(date, regex = '^[OnElLe]{2} [0-9]{2}( de)? [A-Za-zéû]*$')){
d <- gsub('[OnElLe]{2} ([0-9]{2})( de)? [A-Za-zéû]*', '\\1', date)
m <- gsub('[OnElLe]{2} [0-9]{2}( de)? ([A-Za-zéû]*)', '\\2', date)
y <- format(Sys.Date(), "%Y")
date <- as.Date(paste0(y, "-", match(m, month_name), "-", d))
}else if(stri_detect(date, regex = '^[OnElLe]{2} [0-9]{2}( de)? [A-Za-zéû]* [0-9]{4}$')){
d <- gsub('[OnElLe]{2} ([0-9]{2})( de)? [A-Za-zéû]* [0-9]{4}', '\\1', date)
m <- gsub('[OnElLe]{2} [0-9]{2}( de)? ([A-Za-zéû]*) [0-9]{4}', '\\2', date)
y <- gsub('[OnElLe]{2} [0-9]{2}( de)? [A-Za-zéû]* ([0-9]{4})', '\\2', date)
date <- as.Date(paste0(y, "-", match(m, month_name), "-", d))
}
return(date)
}
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
# ///////// FIN PACKAGES NÉCESSAIRES & FONCTIONS /////////
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
# ========= PRÉPARATION SCRIPT VAISSEAUX, S'EXÉCUTE UNE FOIS =========
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
message('Préparation du scrapping des pages de fin des vaisseaux')
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Données =========
# --- Variables traduction
path_en <- "http://mush.twinoid.com/theEnd/"
path_es <- "http://mush.twinoid.es/theEnd/"
path_fr <- "http://mush.vg/theEnd/"
exp_404_en <- "This ship cannot be found\\. Yet another strike by the dark matter, the Mush or some other ridiculously unlikely but apparently possible entity\\.\\.\\. Sorry :-\\)"
exp_404_es <- "Esta nave está extraviada\\. Debió ser víctima de un accidente, de un Mush u otra graciosa eventualidad\\.\\.\\. Lo sentimos :-\\)"
exp_404_fr <- "Ce vaisseau est introuvable\\. Encore un coup de la matière noire, du Mush ou tout autre explication farfelue générique\\.\\.\\. Désolé :-\\)"
exp_stats_en <- '<table class="summar"><tr><th>Planets found</th><th>Explorations</th><th>Hunters defeated</th><th>Spores generated</th><th>Number of Mush</th><th>Rebel bases contacted</th></tr><tr><td>[0-9]*</td><td>[0-9]*</td><td>[0-9]*</td><td>[0-9]*</td><td>[0-9]*</td><td>[0-9]*</td></tr></table>'
exp_stats_es <- '<table class="summar"><tr><th>Planetas encontrados</th><th>Exploraciones</th><th>Hunters derribados</th><th>Esporas generadas</th><th>Cantidad de Mush</th><th>Bases rebeldes contactadas</th></tr><tr><td>[0-9]*</td><td>[0-9]*</td><td>[0-9]*</td><td>[0-9]*</td><td>[0-9]*</td><td>[0-9]*</td></tr></table>'
exp_stats_fr <- '<table class="summar"><tr><th>Planètes trouvées</th><th>Explorations</th><th>Hunters abattus</th><th>Spores générés</th><th>Nombre de Mush</th><th>Bases rebelles contactées</th></tr><tr><td>[0-9]*</td><td>[0-9]*</td><td>[0-9]*</td><td>[0-9]*</td><td>[0-9]*</td><td>[0-9]*</td></tr></table>'
exp_title01_en <- "<th>Ships Commanders</th><td>(?:<div class=\"[^\"]*\"></div><span class=\"charname\">[^<]*</span>)*</td>"
exp_title01_es <- "<th>Los comandantes</th><td>(?:<div class=\"[^\"]*\"></div><span class=\"charname\">[^<]*</span>)*</td>"
exp_title01_fr <- "<th>Les Commandants</th><td>(?:<div class=\"[^\"]*\"></div><span class=\"charname\">[^<]*</span>)*</td>"
exp_title02_en <- "<th>NERON Administrator</th><td>(?:<div class=\"[^\"]*\"></div><span class=\"charname\">[^<]*</span>)*</td>"
exp_title02_es <- "<th>El Administrador NERON</th><td>(?:<div class=\"[^\"]*\"></div><span class=\"charname\">[^<]*</span>)*</td>"
exp_title02_fr <- "<th>L'Administrateur NERON</th><td>(?:<div class=\"[^\"]*\"></div><span class=\"charname\">[^<]*</span>)*</td>"
exp_title03_en <- "<th>Communications Officers</th><td>(?:<div class=\"[^\"]*\"></div><span class=\"charname\">[^<]*</span>)*</td>"
exp_title03_es <- "<th>Los Oficiales de Comunicaciones</th><td>(?:<div class=\"[^\"]*\"></div><span class=\"charname\">[^<]*</span>)*</td>"
exp_title03_fr <- "<th>Les Responsables des Communications</th><td>(?:<div class=\"[^\"]*\"></div><span class=\"charname\">[^<]*</span>)*</td>"
# --- Tablaux traduction
# Morts
death_table <- data.frame(
c(
"Abandoned by the Daedalus to take your chances alone!", "Assassinated by a crewmate!",
"", "Killed by a crafty Mankarog!",
"Injured to death!", "You've burnt yourself to a crisp",
"Hugging allergens!", "Decapitated by some angry dude!",
"Electrocuted by this old heap of metal.", "One-way ticket to the infinite void!",
"Scattered locally by an inquisitive missile!", "",
"The daedalus has returned to Sol!", "Quarantined by NERON",
"Death by impish administrator", "Dead, abandoned in infinite space.",
"Starved to death!", "Dead and lost in space!",
"Dead in a pulverized heap of twisted metal", "Dead on the Daedalus through lack of oxygen!",
"Destroyed by an alien with a headache!", "Succumbed to a generalized infection!",
"Succumbed to an extra-terrestrial disease!", "So badly in fact that you've achieved nothing at all",
"Killed in an accident on an alien planet!", "Another victim claimed by the old Daed-shed",
"Drained of blood", "Your crewmates can't be bothered coming to get you",
"Your intestines clearly needed a good airing", "Your corpse is floating amongst the debris of the pulverized Daedalus",
"You put an end to it all", "You've been abducted",
"You've returned to Sol."
),
c(
"¡Abandonado a tu suerte por el Daedalus!", "¡Asesinado por un compañero!",
"¡Aterrizaje fallido!", "¡Asesinado por un Mankarog!",
"¡Herido de muerte!", "¡Calcinado por un incendio inoportuno!",
"¡Malditos alérgenos!", "¡Decapitado por molesto!",
"¡Electrocutado en este gran bloque de metal!", "¡Bye bye!",
"¡Desaparecido por un misil implacable!", "",
"¡El Daedalus volvió al Sistema Solar!", "Puesto en cuarentena por NERON",
"¡Muerto por un desliz del administrador!", "¡Muerto abandonado en el espacio sideral!",
"¡Muerto de hambre!", "¡Muerto perdido en el espacio!",
"Muerto en una nave destruida.", "¡Muerto en el Daedaulus por falta de oxígeno!",
"¡Pulverizado por un alien con mal humor!", "¡Víctima de una infección generalizada!",
"¡Muerto por una enfermedad extraterrestre!", "Muerto por una tontería.",
"Muerto por un accidente.", "¡Una víctima más de esta nave!",
"¡Desangrado!", "¡A tus compañeros no les dio la gana de rescatarte!",
"¡A tus tripas se les ocurrió salir a tomar aire!", "¡Tu cadáver flota junto a los restos del Daedalus!",
"Pusiste fin a tus días...", "Fuiste secuestrado por extraterrestres.",
"Has vuelto al Sistema Solar."
),
c(
"Abandonné à votre pauvre sort par le Daedalus !", "Assassiné par un équipier !",
"Atterrissage raté !", "Aventurier Trop curieux",
"Blessé à mort !", "Calciné par un incendie inopportun !",
"Calin avec un allergène", "Décapité par un facheux !",
"Electrocuté par ce vieux tas de féraille !", "En route vers le vide infini !",
"Eparpillé par un missile inquisiteur !", "Immolé par le feu !",
"Le Daedalus est rentré sur Sol !", "Mis en quarantaine par NERON",
"Mort à cause d'un Administrateur Taquin !", "Mort abandonné dans l'espace inter sidéral !",
"Mort de faim !", "Mort perdu dans l'espace infini !",
"Mort présent dans un appareil annihilé.", "Mort sur le Daedalus par manque d'oxygène !",
"Pulvérisé par un alien mal réveillé !", "Succombé à une infection massive !",
"Succombé à une maladie extra-terrestre !", "Tellement mal en point qu'un rien vous a achevé",
"Tué par un accident dans un monde alien !", "Victime de cette vieille bicoque !",
"Vidé de votre sang !", "Vos co-équipiers avaient la flemme de venir vous chercher !",
"Vos tripes avaient besoin de prendre l'air !", "Votre cadavre flotte parmi les débris du Daedalus pulvérisé !",
"Vous avez mis fin à vos jours...", "Vous avez été enlevé.",
"Vous êtes rentré sur Sol."
)
)
# Prix Nova
nova_table <- data.frame(
c(
"Golden Supernova Award", "Supernova of Excellence Award", "Honorable Supernova Award",
"Discovered Supernova Award", "Special Nova Award", "Prize Nova"
),
c(
"Premio Super Nova de Oro", "Premio Super Nova del Valor", "Premio Super Nova de Honor",
"Premio Super Nova Sideral", "Premio Nova Especial", "Premio Nova"
),
c(
"Prix Super Nova d'Or", "Prix Super Nova d'Excellence", "Prix Super Nova d'Honneur",
"Prix Super Nova Révélée", "Prix Nova Spécial", "Prix Nova"
)
)
# Surnoms
surnames_table <- data.frame(
c(
"Workshy Waster", "Anonymous Hacker", "Decapitated Mushroom", "Two left hands",
"Energy Dissipator", "Sleeparoo", "Failed Start", "Radio Silence Engaged", "",
"", "Stealthy Swindler", "Gardon Diem", "", "", "I shouldn't have come",
"Pointless Player", "Violence is Evil", "Half Ninja, Half Snake", "Brute Force Mutant", "Battlestar Class Pilot",
"Once your appetite goes", "", "Super Optimizer", ""
),
c(
"", "Trabajador del año", "Hongo decapitado", "Manos inútiles",
"Energía desperdiciada", "Dormilón caradura", "Diseño incompleto", "Me comieron la lengua", "",
"", "Bribón enmascarado", "Gardon Diem", "Vago invertebrado", "", "No debí venir",
"Gamer al cubo", "Pacifista", "", "Mutante Fuerza Bruta", "Piloto Battlestar",
"Qué comemos hoy", "", "Super Optimizador", ""
),
c(
"Aye Aye", "Bidouilleur Anonymous", "Champignon décapité", "Deux-mains-gauches",
"Dilapidateur d'énergie", "Dormeur indolent", "Ébauche inachevée", "En mode silence-radio", "F117A pas discret",
"Foetus", "Fourbasse masquée", "Gardon Diem", "Glandeur invertébré", "Gynéco ( Le Doc... )", "J'aurais pas dû venir",
"Joueur useless", "La violence c'est le mal", "Mi ninja mi anguille", "Mutant Force Brute", "Pilote classe Battlestar",
"Quand l'appétit va", "Starbuck ( pas le café )", "SuperOptimisator", "Technicien omnipotent"
)
)
# --- Compression
# Morts
death_compress <- data.frame(
c(
"Abandonné à votre pauvre sort par le Daedalus !", "Assassiné par un équipier !",
"Atterrissage raté !", "Aventurier Trop curieux",
"Blessé à mort !", "Calciné par un incendie inopportun !",
"Calin avec un allergène", "Décapité par un facheux !",
"Electrocuté par ce vieux tas de féraille !", "En route vers le vide infini !",
"Eparpillé par un missile inquisiteur !", "Immolé par le feu !",
"Le Daedalus est rentré sur Sol !", "Mis en quarantaine par NERON",
"Mort à cause d'un Administrateur Taquin !", "Mort abandonné dans l'espace inter sidéral !",
"Mort de faim !", "Mort perdu dans l'espace infini !",
"Mort présent dans un appareil annihilé.", "Mort sur le Daedalus par manque d'oxygène !",
"Pulvérisé par un alien mal réveillé !", "Succombé à une infection massive !",
"Succombé à une maladie extra-terrestre !", "Tellement mal en point qu'un rien vous a achevé",
"Tué par un accident dans un monde alien !", "Victime de cette vieille bicoque !",
"Vidé de votre sang !", "Vos co-équipiers avaient la flemme de venir vous chercher !",
"Vos tripes avaient besoin de prendre l'air !", "Votre cadavre flotte parmi les débris du Daedalus pulvérisé !",
"Vous avez mis fin à vos jours...", "Vous avez été enlevé.",
"Vous êtes rentré sur Sol."
),
c(
"Abandonné", "Assassiné",
"Patrouilleur pulvérisé", "Tué par Mankarog Chafouin !",
"Blessures...", "Brûlé",
"Allergie", "Décapité",
"Electrocuté", "Assassinés par NERON",
"Roquetté", "Immolé",
"Daedalus rentré sur Sol", "Mis en quarantaine par NERON",
"Super Nova", "Dans l'espace sans pouvoir respirer",
"Famine", "Aucune infirmerie disponible",
"Combat spatial", "Plus d'oxygène",
"Aventurier pas assez combatif", "Septicémie",
"Morsure Noire", "Circonstances funestes",
"Aventurier malchanceux", "Plaque de métal",
"Saigné", "Aventurier perdu",
"Auto-Extrait", "Daedalus détruit",
"Dépression fatale", "Enlevé par des races supérieures",
"Daedalus s'est installé sur Eden"
)
)
# Prix Nova
nova_compress <- data.frame(
c(
"Prix Super Nova d'Or", "Prix Super Nova d'Excellence", "Prix Super Nova d'Honneur",
"Prix Super Nova Révélée", "Prix Nova Spécial", "Prix Nova"
),
c(
"5", "4", "3",
"2", "1", "0"
)
)
# --- Tableaux de correspondance
# Personnages
char_var_table <- data.frame(
c(
"admin",
"andie", "chao", "chun", "derek", "eleesha", "finola",
"frieda", "gioele", "hua", "ian", "janice", "jinsu",
"kuanti", "paola", "raluca", "roland", "stephen", "terrence"
),
c(
"Admin",
"Andie Graham", "Wang Chao", "Zhong Chun", "Derek Hogan", "Eleesha Williams", "Finola Keegan",
"Frieda Bergmann", "Gioele Rinaldo", "Jiang Hua", "Ian Soulton", "Janice Kent", "Kim Jin Su",
"Lai Kuan-Ti", "Paola Rinaldo", "Raluca Tomescu", "Roland Zuccali", "Stephen Seagull", "Terrence Archer"
),
c(
"Admin",
"Andie", "Chao", "Chun", "Derek", "Eleesha", "Finola",
"Frieda", "Gioele", "Hua", "Ian", "Janice", "Jin Su",
"Kuan-Ti", "Paola", "Raluca", "Roland", "Stephen", "Terrence"
)
)
# Recherches
research_var_table <- data.frame(
c(
"anabolisant", "bacterie", "patuline", "meridon",
"mycoscan", "distillateur", "natamy", "gaz",
"hydratation", "langage", "lentille", "mush_hunter",
"mycoalarm", "pheromodem", "pommade", "races",
"savon", "constipaspore", "retro_fongique", "suceur",
"calculateur", "reproduction", "tesla"
),
c(
"steroids", "mushophage_bacteria", "patuline_scrambler", "fungus_scrambler",
"mycoscan", "drug_dispenser", "natamy_gun", "antispore_gaz",
"infinite_water", "myco_dialect", "ncc_lens", "mush_predator",
"myco_alarm", "pheromodem", "healing_ointmant", "mush_breeds",
"mushicide_soap", "constipaspore_serum", "anti_mush_serum", "spore_extractor",
"super_calc", "mush_breeding_system", "tesla_sup2x"
),
c(
"Anabolisant", "Bactérie mushovore", "Brouilleur à patuline", "Brouilleur Meridon",
"Création du Mycoscan", "Distillateur de stupéfiant", "Fusil Natamy", "Gaz antispore",
"Hydratation Perpétuelle", "Langage Mush", "Lentille NCC", "Le Mush-Hunter : ZC16H",
"MycoAlarm", "Pheromodem", "Pommade ultra-cicatrisante", "Races Mush",
"Savon Mushicide", "Sérum de constipaspore", "Sérum Retro-Fongique", "Suceur de Spore",
"Super Calculateur", "Système de reproduction Mush", "Tesla SUP2X"
)
)
# Projets
project_var_table <- data.frame(
c(
"acceleration", "agrandissement", "arroseurs", "bouclier",
"canon", "conduite", "coursives", "couveuse",
"demantellement", "detecteur_ondes", "detecteur_anomalie", "detecteur_pistons",
"detecteur_incendie", "distributeur", "drone", "filet",
"hydropots", "lampes", "lavabo", "participation",
"portail", "antigrav", "decollage", "pulsateur",
"radar", "rafistolage", "rapatriement", "reducteur",
"reservoir", "terminaux", "torrefacteur", "chauffage",
"cuisine", "dynarcade", "isolateur", "jukebox",
"coccinelles", "actopi", "debris", "thalasso",
"visee"
),
c(
"chipset_acceleration", "icarus_larger_bay", "auto_watering", "plasma_shield",
"patrolship_blaster_gun", "oxy_more", "armour_corridor", "hydroponic_incubator",
"dismantling", "quantum_sensors", "equipment_sensor", "door_sensor",
"fire_sensor", "food_retailer", "extra_drone", "magnetic_net",
"extra_hydroponpots", "heat_lamp", "icarus_lavatory", "neron_project_thread",
"bay_door_xxl", "icarus_antigrav_propeller", "patrol_ship_launcher", "turret_extra_firerate",
"radar_trans_void", "bric_broc", "auto_return_icarus", "trail_reducer",
"patrolship_extra_ammo", "auxiliary_terminal", "fission_coffee_roaster", "floor_heating",
"apero_kitchen", "call_of_dirty", "noise_reducer", "beat_box",
"parasite_elim", "whos_who", "trash_load", "thalasso", "neron_targeting_assist"
),
c(
"Accélération du processeur", "Agrandissement de la cale", "Arroseurs automatiques", "Bouclier plasma",
"Canon blaster", "Conduite Oxygénées", "Coursives blindées", "Couveuse hydroponique",
"Démantellement", "Détecteur à onde de probabilité", "Détecteur d'anomalie", "Détecteurs de pistons défectueux",
"Détecteurs d'incendie", "Distributeur pneumatique de nourriture", "Drone supplémentaire", "Filet magnetique",
"Hydropots supplémentaires", "Lampes à chaleur", "Lavabo opportun", "Participation de NERON",
"Portail de décollage extra-large", "Propulseur Antigrav", "Propulseur de décollage", "Pulsateur inversé",
"Radar à onde spatiale", "Rafistolage Général", "Rapatriement magnétique", "Réducteur de traînée",
"Réservoir de Teslatron", "Terminaux auxiliaires", "Torréfacteur à fission", "Chauffage au sol",
"Cuisine SNC", "Dynarcade", "Isolateur phonique", "Jukebox",
"Nano-coccinelles", "Protocole ACTOPI", "Tas de débris", "Thalasso",
"Visée Heuristique"
)
)
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Calculs et déclarations var =========
lang_var <- c("path", "exp_404", "exp_stats", "exp_title01", "exp_title02", "exp_title03")
count <- 0
ship_n <- (ship_end - ship_start + 1)
ship_t <- (ship_end - ship_start + 1) * length(languages)
nrow_char <- nrow(char_var_table)
research_nrow <- nrow(research_var_table)
project_nrow <- nrow(project_var_table)
surnames_null <- data.frame(
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
)
names(surnames_null) <- surnames_table[,3]
data <- NULL
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
# ///////// FIN PRÉPARATION SCRIPT VAISSEAUX /////////
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
# ========= BOUCLE S'EXÉCUTANT UNE FOIS PAR LANGUE ET PAR VAISSEAU =========
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
message('Début du scrapping...')
for(lang in languages){
for(lang_v in lang_var){assign(lang_v, get(paste0(lang_v, "_", lang)))}
user_list <- NULL
count <- count + 1
count_lines <- 0
for(ship in ship_start:ship_end){
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Page de fin =========
# --- Téléchargement de la page de fin
url <- paste0(path, ship)
content <- download(url)
# --- Nettoyage du code HTML
content <- gsub('(\\\\r|\\\\n|\\\\t)', '', content)
content <- gsub('(\\r|\\n|\\t)', '', content)
content <- gsub('<img src="[^%"]*(%0D|%0A|%09)+', '', content)
content <- gsub('</div>(%0D|%0A|%09)*</blockquote>', '</p></div></blockquote>', content)
content <- gsub('(<div class="triumph[^"]*">|<ul class="ul">|<div class="inl-blck">)', '</p></div></blockquote>\\1', content)
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Vaisseau =========
# --- Page de fin trouvée
if(str_detect(content, exp_404)){
available <- "0"
}else{
available <- "1"
}
# --- Type de fin
if(str_detect(content, '<div class="destroyed"></div>')){
end <- "Détruit"
}else if(str_detect(content, '<div class="sol"></div>')){
end <- "Sol"
}else if(str_detect(content, '<div class="eden"></div>')){
end <- "Eden"
}else{
end <- ""
}
# --- Création data.frame vaisseau
game <- data.frame(lang, ship, available, end)
names(game) <- c("Langue", "ID", "Page trouvée", "Fin")
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Personnages =========
# --- Séparation des boites personnages
xml <- read_xml(content, as_html = TRUE)
xml_1 <- xml_find_all(xml, "//div[@class='boxstar cdBoxStar']")
xml_2 <- xml_find_all(xml, "//div[@class='boxguest cdBoxGuest']")
xml_3 <- xml_find_all(xml, "//div[@class='boxextra cdBoxExtra']")
all_char <- NULL
for(x in xml_1){all_char <- append(all_char, toString(x))}
for(x in xml_2){all_char <- append(all_char, toString(x))}
for(x in xml_3){all_char <- append(all_char, toString(x))}
all_char <- gsub("\\n", "", all_char)
# --- Nombre de personnages présents
n_char <- as.numeric(length(all_char))
# --- Réinitionalisation var personnages
for(char_var in char_var_table[,1]){
for(char_dupl in 1:char_dupl_max){
assign(paste0(char_var , char_dupl), 0)
assign(paste0(char_var , char_dupl, "_playerid"), "")
assign(paste0(char_var , char_dupl, "_playername"), "")
assign(paste0(char_var , char_dupl, "_likes"), "")
assign(paste0(char_var , char_dupl, "_triumph"), "")
assign(paste0(char_var , char_dupl, "_mush"), "")
assign(paste0(char_var , char_dupl, "_message"), "")
assign(paste0(char_var , char_dupl, "_death"), "")
assign(paste0(char_var , char_dupl, "_nova"), "")
}
}
# --- Boucle s'éxecutant pour chaque personnage présent sur la page de fin
for(char in all_char){
# Suppression des caractères suivant la fin de la boite personnage (en cas de commentaire buggué)
char <- gsub('<a href=\"#\" [^>]*><img src=\"/img/icons/ui/notes.gif\"></a>.*', '', char)
# Sélection de la variable personnage, selon son nom
for(r in 1:nrow_char){
char_cn <- char_var_table[r,2]
if(str_detect(char, paste0("<h3>", char_cn))){
char_varname <- as.character(char_var_table[r,1])
break
}
}
for(char_dupl in 1:char_dupl_max){
if(get(paste0(char_varname, char_dupl)) == 0){break}
}
char_var <- paste0(char_varname, char_dupl)
# Présence du personnage dans le vaisseau
assign(char_var, 1)
# Joueur : ID, nom, nombre de likes
c <- stri_match_first_regex(char, "tid_id=\"([0-9]*)\">([^<]*)</a><div[^>]*>([0-9]*)")
assign(paste0(char_var, "_playerid"), c[2])
if(is.null(user_list)){user_list <- c[2]}else{user_list <- c(user_list, c[2])}
assign(paste0(char_var, "_playername"), html2txt(c[3]))
assign(paste0(char_var, "_likes"), c[4])
# Triomphe
assign(paste0(char_var, "_triumph"), stri_match_first_regex(char, "<div class=\"(?:triumph|score) ?(?:triumphmush)?\">([0-9]*)")[2])
# Camp
if(stri_detect(char, regex = " triumphmush") || stri_detect(char, regex = "mush_triumph.png")){
if(compress){
assign(paste0(char_var, "_mush"), "M")
}else{
assign(paste0(char_var, "_mush"), "mush")
}
}else{
if(compress){
assign(paste0(char_var, "_mush"), "H")
}else{
assign(paste0(char_var, "_mush"), "humain")
}
}
# Cause de la mort
if(str_detect(char, "<div class=\"triumph")){
death <- ""
}else{
death <- stri_match_first_regex(
char,
"<div class=\"[^\"]*\"><div class=\"score\">[0-9]*<img [^>]*></div><p>(?:<img [^>]*>)? *([^<]*)</p></div>"
)[,2]
if(lang != "fr"){
if(lang == "en"){
death_t <- as.character(death_table[which(death_table[,1] == death),3])
}else if(lang == "es"){
death_t <- as.character(death_table[which(death_table[,2] == death),3])
}
if(!is.na(death_t) && length(death_t) != 0){death <- death_t}
}
}
if(compress){
death_t <- as.character(death_compress[which(death_compress[,1] == death),2])
if(!is.na(death_t) && length(death_t) != 0){death <- death_t}
}
assign(paste0(char_var, "_death"), death)
# Prix Nova
nova <- stri_match_first_regex(char, "<div class=\"nova\"[^>]*><img [^>]*> *([A-Za-z 'é]+)</div>")[,2]
if(lang != "fr"){
if(lang == "en"){
nova_t <- as.character(nova_table[which(nova_table[,1] == nova),3])
}else if(lang == "es"){
nova_t <- as.character(nova_table[which(nova_table[,2] == nova),3])
}
if(!is.na(nova_t) && length(nova_t) != 0){nova <- nova_t}
}
if(compress){
nova_t <- as.character(nova_compress[which(nova_compress[,1] == nova),2])
if(!is.na(nova_t) && length(nova_t) != 0){nova <- nova_t}
}
assign(paste0(char_var, "_nova"), nova)
# Message de fin
assign(paste0(char_var, "_message"),
html2txt(
gsub("</?(p|br)>", " ",
stri_match_first_regex(
gsub("»?(</p>)? *(</div>)? *(\n)? *(</blockquote>)? *$", "",
toString(xml_find_all(read_xml(
gsub("</?span [^>]*>", "",
gsub("</?em>", "//", gsub("</?strong>", "**", gsub("</?cite>", "''",
gsub("<img src=\"[^\"]*\" alt=\"([^\"]*)\" class=\"tid_ico\">", "\\1",
char
)
)))
),
as_html = TRUE), "//blockquote")[1])
),
"<p>«?(.*)", cg_missing = ""
)[2]
)
)
)
}
# --- Création data.frame personnages
characters <- data.frame(n_char)
names(characters) <- c("Personnages")
for(r in 1:nrow_char){
char_var <- char_var_table[r,1]
char_name <- as.character(char_var_table[r,3])
for(char_dupl in 1:char_dupl_max){
char_data <- data.frame(
get(paste0(char_var , char_dupl)),
get(paste0(char_var , char_dupl, "_playerid")),
get(paste0(char_var , char_dupl, "_playername")),
get(paste0(char_var , char_dupl, "_likes")),
get(paste0(char_var , char_dupl, "_triumph")),
get(paste0(char_var , char_dupl, "_nova")),
get(paste0(char_var , char_dupl, "_mush")),
get(paste0(char_var , char_dupl, "_death")),
get(paste0(char_var , char_dupl, "_message"))
)
names(char_data) <- c(
paste0(char_name, char_dupl),
paste0(char_name, char_dupl, " joueurID"),
paste0(char_name, char_dupl, " joueurPseudo"),
paste0(char_name, char_dupl, " likes"),
paste0(char_name, char_dupl, " triomphe"),
paste0(char_name, char_dupl, " nova"),
paste0(char_name, char_dupl, " humain/mush"),
paste0(char_name, char_dupl, " mort"),
paste0(char_name, char_dupl, " message")
)
characters <- cbind(characters, char_data)
if(available == "0"){characters[characters == 0] <- NA}
}
}
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Casting =========
casting <- stri_match_first_regex(
content,
"<h2 id=\"producer\">[^<]*<a href=\"/group/([0-9]*)\">([^<]*)</a></h2>",
cg_missing = ""
)
casting_id <- casting[2]
casting_name <- html2txt(casting[3])
casting <- data.frame(casting_id, casting_name)
names(casting) <- c("Casting ID", "Casting Nom")
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Stats =========
all_stats <- str_extract_all(str_extract(content, exp_stats), '[0-9]+', simplify = TRUE)
stats <- data.frame(
all_stats[1], all_stats[2], all_stats[3],
all_stats[4], all_stats[5], all_stats[6]
)
names(stats) <- c(
"Planètes trouvées", "Explorations", "Hunters abattus",
"Spores générés", "Nombre de Mush", "Bases rebelles contactées"
)
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Études =========
# --- Recherches
research_df <- NULL
for(i in 1:research_nrow){
research_var <- as.character(research_var_table[i,1])
research_image <- research_var_table[i,2]
if(str_detect(content, paste0('/img/cards/research/', research_image, '.png'))){
assign(research_var, 1)
}else{
assign(research_var, 0)
}
research_data <- data.frame(get(research_var))
research_name <- as.character(research_var_table[i,3])
names(research_data) <- c(research_name)
if(is.null(research_df)){research_df <- research_data}else{research_df <- cbind(research_df, research_data)}
}
research_n <- data.frame(rowSums(research_df))
names(research_n) <- c("Recherches")
research_df <- cbind(research_n, research_df)
# --- Projets
project_df <- NULL
for(i in 1:project_nrow){
project_var <- as.character(project_var_table[i,1])
project_image <- project_var_table[i,2]
if(str_detect(content, paste0('/img/cards/projects/', project_image, '.png'))){
assign(project_var, 1)
}else{
assign(project_var, 0)
}
project_data <- data.frame(get(project_var))
project_name <- as.character(project_var_table[i,3])
names(project_data) <- c(project_name)
if(is.null(project_df)){project_df <- project_data}else{project_df <- cbind(project_df, project_data)}
}
project_n <- data.frame(rowSums(project_df))
names(project_n) <- c("Projets")
project_df <- cbind(project_n, project_df)
# --- Pilgred
if(str_detect(content, '/img/cards/pilgred.png')){pilgred <- 1}else{pilgred <- 0}
pilgred_df <- data.frame(pilgred)
names(pilgred_df) <- c("PILGRED")
# --- Création data.frame études
studies <- cbind(research_df, project_df, pilgred_df)
if(available == "0"){studies[studies == 0] <- NA}
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Titres =========
# --- Commandant
title01 <- stri_match_all_regex(
stri_extract(
content,
regex = exp_title01
),
"<span class=\"charname\">([^<]*)</span>",
cg_missing = NA
)[[1]][,2]
if(is.na(title01)){
if(available != "0"){title01_n <- 0}else{title01_n <- NA}
title01_char <- NA
}else{
title01_n <- length(title01)
title01_char <- toString(title01)
}
# --- Responsable des Communication
title03 <- stri_match_all_regex(
stri_extract(
content,
regex = exp_title03
),
"<span class=\"charname\">([^<]*)</span>",
cg_missing = NA
)[[1]][,2]
if(is.na(title03)){
if(available != "0"){title03_n <- 0}else{title03_n <- NA}
title03_char <- NA
}else{
title03_n <- length(title03)
title03_char <- toString(title03)
}
# --- Administrateur NERON
title02 <- stri_match_all_regex(
stri_extract(
content,
regex = exp_title02
),
"<span class=\"charname\">([^<]*)</span>",
cg_missing = NA
)[[1]][,2]
if(is.na(title02)){
if(available != "0"){title02_n <- 0}else{title02_n <- NA}
title02_char <- NA
}else{
title02_n <- length(title02)
title02_char <- toString(title02)
}
# --- Création data.frame titres
titles <- data.frame(title01_n, title01_char, title02_n, title02_char, title03_n, title03_char)
names(titles) <- c(
"Commandant nombre", "Commandant personnages",
"Admin NERON nombre", "Admin NERON personnages",
"Resp Comm nombre", "Resp Comm personnages"
)
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Surnoms =========
# --- Extraction des lignes contenant les surnoms
surnames_lines <- str_extract_all(
str_extract(
content,
"<table class=\"summar daed\">(?:<tr><th[^>]*> ?[^<]* ?</th><td><div[^>]*></div><span class=\"charname\">[^<]*</span></td></tr>){5}</table>"
),
"<tr><th[^>]*> ?[^<]* ?</th><td><div[^>]*></div><span class=\"charname\">[^<]*</span></td></tr>",
simplify = TRUE
)
surnames_lines <- gsub('(\\\\xc2|\\\\xab|\\\\xa0|\\\\xbb|«|»|\\\\xbf|¿)', '', surnames_lines)
# --- Pour chaque ligne, extraction surnom et personnage, ajout au data.frame regroupant les 5 surnoms
surnames <- NULL
for(surname_line in surnames_lines){
surname_line <- gsub(
" ?;", ";",
gsub(
"<tr><th[^>]*> ?([^<]*) ?</th><td><div[^>]*></div><span class=\"charname\">([^<]*)</span></td></tr>",
"\\1;\\2",
surname_line
)
)
surname_n <- gsub("^[^A-ZÉa-zéíñóúû1-9'()-]*", "", gsub("[^A-ZÉa-zéíñóúû1-9'()-]*$", "",
gsub("[[:space:]]", " ", gsub("^([^;]*);.*$", "\\1", surname_line))
))
surname_c <- gsub("^[^;]*;(.*)$", "\\1", surname_line)
surname <- data.frame(surname_c)
names(surname) <- surname_n
if(is.null(surnames)){surnames <- surname}else{surnames <- cbind(surnames, surname)}
}
# --- Si aucun surnom, data.frame par défaut, sinon ajout des valeurs (traduites) du data.frame des surnoms trouvés
if(is.na(surnames[1])){
surnames <- surnames_null
}else{
if(lang != "fr"){
surname_name_t <- NULL
for(surnames_name in names(surnames)){
if(lang == "en"){
surname_nt <- as.character(surnames_table[which(surnames_table[,1] == surnames_name),3])
}else if(lang == "es"){
surname_nt <- as.character(surnames_table[which(surnames_table[,2] == surnames_name),3])
}
if(is.null(surname_name_t)){surname_name_t <- surname_nt}else{surname_name_t <- append(surname_name_t, surname_nt)}
}
names(surnames) <- surname_name_t
}
surnames <- merge.data.frame(surnames_null, surnames, all.y = TRUE)
surnames <- surnames[,order(colnames(surnames))]
}
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Opérations de fin de boucle =========
# --- Date de fin du webscrapping
date <- data.frame(Sys.time())
names(date) <- "Date"
# --- Compilation des données vaisseau, puis ajout à la BDD globale
ship_data <- cbind(game, casting, characters, stats, studies, titles, surnames, date)
if(is.null(data)){data <- ship_data}else{data <- rbind(data, ship_data)}
remove(ship_data)
gc(verbose = FALSE)
# --- Affichage n°
cat(paste0("Vaisseaux ", floor((ship - ship_start + 1 + ((count - 1) * ship_n)) / ship_t * 100), "% : ", lang, "-", ship), fill = TRUE)
# --- Création fichiers CSV puis supression données, si nombre de ligne atteint ou dernier vaisseau
count_lines <- count_lines + 1
if(count_lines / csv_size == round(count_lines / csv_size) || ship == ship_end){
message(paste0('Création du fichier ', file_name, csv_ship_name, "_", lang, ship, ".csv"))
write.csv(
data,
paste0(file_name, csv_ship_name, "_", lang, ship, ".csv"),
row.names = FALSE,
fileEncoding = "UTF-8",
na = ""
)
user_list <- data.frame(unique(user_list))
write.table(
user_list,
paste0(file_name, csv_list_name, "_", lang, ".csv"),
row.names = FALSE, col.names = FALSE,
fileEncoding = "UTF-8",
sep = ",", na = "",
append = TRUE
)
remove(data, user_list)
remove(
all_stats, stats,
surname, surnames, surnames_lines,
content, url,
x, xml, xml_1, xml_2, xml_3,
game, available, end, date,
casting, casting_id, casting_name,
c, char_data, characters,
all_char, char, char_cn, char_dupl, char_name, char_var, char_varname, n_char,
death, death_t, nova, nova_t,
studies,
pilgred_df, project_data, project_df, project_n, research_data, research_df, research_n,
pilgred, project_image, project_name, project_var, research_image, research_name, research_var,
surname_c, surname_line, surname_n, surname_name_t, surname_nt, surnames_name,
titles, title01, title01_char, title01_n, title02, title02_char, title02_n, title03, title03_char, title03_n
)
gc(verbose = FALSE, full = TRUE)
data <- NULL
user_list <- NULL
}
}
chk_loop <- TRUE
}
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
# ///////// FIN BOUCLE /////////
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
# ################## SI SCRIPT JOUEURS ################## #
# Exécution ou non de la suite du script, selon réglages
# ####################################################### #
if(get_users && chk_loop){
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
# ========= PRÉPARATION SCRIPT JOUEURS, S'EXÉCUTE UNE FOIS =========
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
message('Préparation du scrapping des pages de profil des joueurs')
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Suppression des variables inutiles =========
if(exists("char_var_table") && exists("char_dupl_max")){
for(char_var in char_var_table[,1]){
for(char_dupl in 1:char_dupl_max){
remove(list = paste0(char_var , char_dupl))
remove(list = paste0(char_var , char_dupl, "_playerid"))
remove(list = paste0(char_var , char_dupl, "_playername"))
remove(list = paste0(char_var , char_dupl, "_likes"))
remove(list = paste0(char_var , char_dupl, "_triumph"))
remove(list = paste0(char_var , char_dupl, "_mush"))
remove(list = paste0(char_var , char_dupl, "_message"))
remove(list = paste0(char_var , char_dupl, "_death"))
remove(list = paste0(char_var , char_dupl, "_nova"))
}
}
}
if(exists("research_nrow") && exists("research_var_table")){
for(i in 1:research_nrow){
research_var <- as.character(research_var_table[i,1])
remove(list = research_var)
}
}
if(exists("project_nrow") && exists("project_var_table")){
for(i in 1:project_nrow){
project_var <- as.character(project_var_table[i,1])
remove(list = project_var)
}
}
remove(
ship_start, ship_end, ship, ship_n, ship_t,
count, count_lines,
lang, lang_v, lang_var,
char_dupl_max,
csv_ship_name,
exp_404, exp_404_en, exp_404_es, exp_404_fr, path, path_en, path_es, path_fr,
exp_stats_en, exp_stats_es, exp_stats_fr,
exp_title01_en, exp_title01_es, exp_title01_fr, exp_title02_en, exp_title02_es, exp_title02_fr, exp_title03_en, exp_title03_es, exp_title03_fr,
death_table, nova_table, surnames_table, char_var_table, research_var_table, project_var_table,
nrow_char, project_nrow, research_nrow,
death_compress, nova_compress
)
gc(verbose = FALSE, full = TRUE)
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Données =========
# --- Variables traduction
s_en <- "77"
s_es <- "76"
s_fr <- "41"
exp_404_en <- "<title>18 great games brought to you by Motion Twin - Twinoid</title>"
exp_404_es <- "<title>18 divertidos juegos en línea, ¡nada que descargar y gratis! - Twinoid</title>"
exp_404_fr <- "<title>24 jeux gratuits signés Motion Twin en 1 seul compte - Twinoid</title>"
exp_likes_en <- '<div class=\"name\"><span class=\"count\">[0-9]*<em>x</em></span>Everybody loves you!</div>'
exp_likes_es <- '<div class=\"name\"><span class=\"count\">[0-9]*<em>x</em></span>¡Todo el mundo te ama!</div>'
exp_likes_fr <- '<div class=\"name\"><span class=\"count\">[0-9]*<em>x</em></span>Tout le monde vous aime !</div>'
# --- Tablaux trophées traduction
goals_table <- data.frame(
c(
"Read all about it!", "Frozen food eaten",
"You have already supported your community by sending a gift to another player, bravo!", "General Announcement",
"Apprenticeship Completed", "Skill books",
"Quartermaster", "Artefacts collected",
"Space Adventurer", "Legendary Adventurer",
"Gagged", "Total Clearance",
"Butcher", "Plasma Shield",
"Shroom Killer", "Mushroom Blitz",
"Hidden Shroom", "Mature Mushroom",
"Precocious Mushroom", "Cat Cuddles",
"Researcher", "Surgeon",
"Designer", "Crystallite Shards",
"Enigma Decoder", "Last Commandant",
"Last Man Standing", "Eden, the contaminated land",
"Eden, the promised land", "All aboard the Daedalus!.",
"Research Team", "Technical Team",
"Explorer", "Fire Extinguisher",
"Grenadier", "",
"Space Hero", "Legendary Hero",
"Politician", "Hunters Defeated",
"Infected", "Contaminator",
"", "Successful Inspections",
"Day 10 Reached", "Day 15 Reached",
"Day 20 Reached", "Day 30 Reached",
"Day 5 Reached", "Days without sleep",
"Judgement", "Kivanç Terzi contacted",
"Little treats you've helped yourself to!", "Disease Contracted",
"", "Missions",
"Natamist", "Nils contacted",
"Number of cycles won contemplating your work", "Number of cycles won running round in circles",
"Number of cycles won... listening to others", "Number of cycles spent sulking in a corner.",
"Number of cycles spent fixing your barnet fair.", "Number of turns spent counting your cash",
"Number of cycles spent cooking up suspect dishes", "Number of cycles spent cultivating psychotropic plants",
"Number of cycles spent taking stuff out", "Number of cycles spent as an intergalactic Don Juan.",
"Number of cycles won, spent with the others", "Number of cycles won playing as the chief",
"Number of cycles won reading the cards", "Number of cycles spent stroking your cat",
"Number of cycles spent fungifying in a dark corner", "Number of cycles examining the infinitely small",
"Number of cycles you've spent providing blood samples", "Number of cycles spent surfing the lowest frequencies",
"Number of cycles spent honoring the SDF.", "New Generations",
"Physician's Core", "Bigger by the day",
"Part Timer", "PILGRED IS BACK!",
"", "Planets Detected",
"Plantations", "Dishes Cooked",
"Doors repaired", "Complete Psychoanalysis",
"Rations eaten", "Cooked rations eaten",
"Rebellion!", "Communications Expert",
"Revolution!", "",
"Breakdown Reporter", "Fire Reports",
"Sol", "Soldier of Humanity",
"", "Psychological Support",
"Artefact Specialist", "Caffeine Junkie",
"Supporter", "Neron is watching you",
"Telegenius", "Triple Victory",
"16 coffees please!", "Old Friends",
"Longest Voyage"
),
c(
"Contacto rebelde", "Alimentos congelados consumidos",
"Has sido generoso enviando un regalo a un jugador. ¡Bravo!", "Anuncio general",
"Instrucción terminada", "Librotrones",
"Abastecedor", "Artefactos colectados",
"Aventurero del Espacio", "Aventurero legendario",
"Amordazado", "Sectores revelados",
"Carnicero", "Escudo plasma",
"Verdugo", "Hongo ácido",
"Hongo oculto", "Hongo maduro",
"Hongo precoz", "Gatos acariciados",
"Investigador", "Cirujano",
"Creador", "",
"Revelador de enigmas", "Último comandante",
"Último superviviente", "Edén, tierra contaminada",
"Edén, tierra prometida", "",
"Equipo de investigadores", "Equipo técnico",
"Explorador", "Extinción de Incendios",
"Granadero", "",
"Héroe Espacial", "Héroe legendario",
"Político", "Hunters derribados",
"Infectado", "Infectador",
"", "Inspecciones exitosas",
"Día 10 alcanzado", "Día 15 alcanzado",
"Día 20 alcanzado", "",
"Día 5 alcanzado", "Días sin dormir",
"", "Kivanç Terzi contactado",
"¡Las pastillitas son mías!", "Enfermedades contraídas",
"", "Misiones",
"Fusilador", "Nils contactado",
"Ciclos pasados admirando tu obra", "Ciclos en una nave de caza",
"Ciclos pasados escuchando a los demás", "Ciclos pasados de mal humor",
"Ciclos pasados bailando música Disco", "Ciclos pasados contando billetes",
"Ciclos pasados cocinando platillos sospechosos", "Ciclos pasados cultivando plantas alucinógenas",
"Ciclos pasados disparándole a cosas", "Ciclos pasados seduciendo compañeros.",
"Ciclos pasados con contactos del tercer tipo", "Ciclos pasados jugando al jefe",
"Ciclos pasados leyendo mapas", "Ciclos pasados acariciando al gato",
"Ciclos pasados como el malo de la película", "Ciclos viendo la vida a través de un microscopio",
"Ciclos pasados haciéndose sacar muestras de sangre", "Ciclos pasados buscando en las bajas frecuencias",
"Ciclos pasados gritando ¡Viva la Federación!", "Cuarto Episodio",
"Sanador", "Honguito",
"Pequeño jugador", "¡PILGRED ESTÁ DE VUELTA!",
"Pionero beta-tester", "Planetas detectados",
"Plantaciones", "Platos cocinados",
"Puertas reparadas", "Psicoanálista",
"Raciones consumidas", "Raciones cocinadas consumidas",
"¡Rebelión!", "Oficial de comunicaciones",
"¡Revolución!", "",
"Señalamiento de averías", "Señalamientos de incendios",
"Sistema Solar", "Defensor de la Humanidad",
"", "Ayudas psicológicas",
"Especialista en artefactos", "Adicto al café",
"Has financiado el juego. ¡Gracias!", "Tercer Episodio",
"Broadcaster", "Triple victoria",
"¡Un cafecito por favor!", "Quinto episodio",
"El viaje más largo"
),
c(
"5ème Colonne", "Aliments Congelés avalés",
"Altruiste", "Annonce Générale",
"Apprentissage Achevé", "Apprentrons",
"Approvisionneur", "Artefacts collectés",
"Aventurier de l'espace", "Aventurier légendaire",
"Bailloné", "Balayages Complets",
"Boucher", "Bouclier Plasma",
"Bourreau Des Mush", "Champignon acidulé",
"Champignon caché", "Champignon mûre",
"Champignon précoce", "Chats câlinés",
"Chercheur", "Chirurgien",
"Concepteur", "Cristallites aigus",
"Décrypteur de l'Enigme", "Dernier Commandant",
"Dernier survivant", "Eden, terre contaminée",
"Eden, terre promise", "Embarquement sur le Daedalus",
"Equipe De Chercheurs", "Equipe Technique",
"Explorateur", "Extinction d'Incendies",
"Grenadier", "Grippe de NERON",
"Héros de l'espace", "Héros légendaire",
"Homme Politique", "Hunters Abattus",
"Infecté", "Infecteur",
"Initié", "Inspections Réussies",
"Jour 10 Atteint", "Jour 15 Atteint",
"Jour 20 Atteint", "Jour 30 Atteint",
"Jour 5 Atteint", "Jours sans s'endormir",
"Jugement", "Kivanç Terzi contacté",
"Laissez manger les petits cachets !", "Maladies contractées",
"Maladies vénériennes contractées", "Missions",
"Natamiste", "Nils contacté",
"Nombre de cycles à contempler son oeuvre", "Nombre de cycles à tourner en rond",
"Nombre de cycles passés à allu...écouter les autres", "Nombre de cycles passés à bouder dans son coin.",
"Nombre de cycles passés à coiffer sa touffe.", "Nombre de cycles passés à compter ses billets",
"Nombre de cycles passés à cuisiner des plats suspects", "Nombre de cycles passés à cultiver des plantes hallucinogènes",
"Nombre de cycles passés à dégommer des trucs", "Nombre de cycles passés à draguer à tout va.",
"Nombre de cycles passés à fréquenter des 3èmes types", "Nombre de cycles passés à jouer au chef",
"Nombre de cycles passés à lire des cartes", "Nombre de cycles passés à passer son temps à caresser son chat",
"Nombre de cycles passés à phongiférer dans son coin", "Nombre de cycles passés à scruter l'infiniment petit",
"Nombre de cycles passés à se faire préléver des échantillons sanguins", "Nombre de cycles passés à surfer sur les basses fréquences",
"Nombre de cycles passés à vénérer la fédération.", "Nouvelles générations",
"Personnel Soignant", "Peti champi deviendra grand",
"Petit joueur", "PILGRED EST DE RETOUR !",
"Pionnier de Xyloph", "Planètes Détectées",
"Plantations", "Plats cuisinés",
"Portes réparées", "Psychanalyse Totale",
"Rations avalées", "Rations cuisinées avalées",
"Rébellion !", "Responsable Des Communications",
"Revolution !", "Siestes",
"Signalement de pannes", "Signalements d'Incendies",
"Sol", "Soldat de l'Humanité",
"Sommeil fongique", "Soutiens Psychologiques",
"Spécialiste en artefacts", "Squatteur De Cafetière",
"Supporter", "Sus aux parasites !",
"Télégénie", "Triple Victoire",
"Un café pour la 16 !", "Vieilles connaissances",
"Voyage Le Plus Long"
)
)
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Calculs et déclarations var =========
lang_var <- c("s", "exp_404", "exp_likes")
count <- 0
user_t <- 0
for(lang in languages){
if(file.exists(paste0(file_name, csv_list_name, "_", lang, ".csv"))){
if(file.info(paste0(file_name, csv_list_name, "_", lang, ".csv"))$size > 0){
user_t <- user_t + length(unique(read.csv(paste0(file_name, csv_list_name, "_", lang, ".csv"), encoding = "UTF-8", header = FALSE, stringsAsFactors = FALSE)[,1]))
}
}
}
goals_null <- data.frame(
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA)
names(goals_null) <- goals_table[,3]
data <- NULL
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
# //////// FIN PRÉPARATION SCRIPT JOUEURS /////////
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
# ========= BOUCLE S'EXÉCUTANT UNE FOIS PAR LANGUE ET PAR JOUEUR =========
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
message('Début du scrapping...')
for(lang in languages){
if(file.exists(paste0(file_name, csv_list_name, "_", lang, ".csv"))){
if(file.info(paste0(file_name, csv_list_name, "_", lang, ".csv"))$size > 0){
for(lang_v in lang_var){assign(lang_v, get(paste0(lang_v, "_", lang)))}
user_list <- sort(unique(read.csv(
paste0(file_name, csv_list_name, "_", lang, ".csv"),
encoding = "UTF-8", header = FALSE, stringsAsFactors = FALSE
)[,1]))
count_lines <- 0
for(user in user_list){
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Téléchargement profil, trophées, contacts =========
# --- Téléchargement et nettoyage page de profil
content <- download(
paste0("https://twinoid.com/", lang, "/user/", user)
)
content <- gsub('(\\r|\\n|\\t)', '', content)
xml <- read_xml(content, as_html = TRUE)
# --- Actualisation cookie et extraction clé
sid <- cookies(GET("https://twinoid.com"))$value
Sys.sleep(wait)
goal_chk <- stri_match_first_regex(content,
"<div class=\"tid_module tid_modinit\" ondblclick=\"return .chk:'([a-z0-9]*)',url:'/mod/userGoals/[0-9]*',infos:'og'.\">"
)[2]
contact_chk <- stri_match_first_regex(content,
"<div class=\"tid_module tid_modinit\" ondblclick=\"return .chk:'([a-z0-9]*)',url:'/mod/contactListPublic/[0-9]*',infos:'og'.\">"
)[2]
# --- Téléchargement et nettoyage modules (trophées & contacts)
goals_html <- download(
paste0(
"https://twinoid.com/mod/userGoals/", user, "?chk=", goal_chk, ";infos=og;s=", s,
";_id=tid_18;jsm=1;lang=fr;host=twinoid.com;proto=https%3A;sid=", sid
)
)
goals_html <- gsub('\\n', '\n', goals_html, fixed = TRUE)
goals_html <- gsub("\\'", "'", goals_html, fixed = TRUE)
goals_html <- gsub('(\r|\n|\t)', '', goals_html)
goals_html <- gsub('.*<div class="note"><div class="note">Note : la valeur en points de chaque gain est approximative (arrondie).</div><table>', '<table>', goals_html)
contact_html <- download(
paste0(
"https://twinoid.com/mod/contactListPublic/", user, "?chk=", contact_chk, ";infos=og;all=true;s=", s,
";_id=tid_18;jsm=1;lang=fr;host=twinoid.com;proto=https%3A;sid=", sid
)
)
contact_html <- gsub('\\n', '\n', contact_html, fixed = TRUE)
contact_html <- gsub("\\'", "'", contact_html, fixed = TRUE)
contact_html <- gsub('(\r|\n|\t)', '', contact_html)
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Infos =========
# --- Page de profil trouvée
if(stri_detect(content, regex = exp_404)){
available <- "0"
}else{
available <- "1"
}
# --- Pseudo et anciens pseudos
pseudo <- gsub('<span on(?:e|E)dit=\"window.location=\'http://twinoid.com/user/account/name\'\">([^<]+)</span>', '\\1',
stri_extract(content, regex = '<span on(?:e|E)dit=\"window.location=\'http://twinoid.com/user/account/name\'\">[^<]+</span>')
)
if(str_detect(content, '<div class="oldNames">')){
on_list <- str_extract_all(
gsub("\\n",'',toString(
xml_find_first(xml, "//div[@class='oldNames']")
)), '<span class=\"tid_tip\" title=\"[^\\"]*\">[^<]*</span>')
old_names <- ""
for(on in on_list[[1]]){
name <- gsub('<span class=\"tid_tip\" title=\"[^\\"]*\">([^<]*)</span>', '\\1', on)
date <- gsub('<span class=\"tid_tip\" title=\"[^:]*: ([^\\"]*)\">.*', '\\1', on)
date <- twino2iso(date, lang)
old_names <- paste0(old_names, ';', paste0(name, ',', date))
}
old_names <- gsub('^;', '', old_names)
}else{
old_names <- NA
}
# --- Variables démographiques (âge, genre, ville, pays)
if(available == "1"){
demo <- gsub("\\n", '',
toString(xml_find_first(read_xml(content, as_html = TRUE),
"//div[@class='tid_asv']"
))
)
}else{
demo <- NA
}
# Genre
genre <- gsub('.*alt=\\"([^"\\]*)\\".*', '\\1',
gsub('.*<div class=\\"tid_age\\"> *<img([^>]*)> *[0-9]* *[yrsañon]{2,4} *</div>.*', '\\1', demo)
)
# Âge
age <- NA
if(!is.na(demo)){
if(str_detect(demo, '<div class=\\"tid_age\\"> *<img[^>]*> *[0-9]* *[yrsañon]{2,4} *</div>')){
age <- gsub('.*<div class=\\"tid_age\\"> *<img[^>]*> *([0-9]*) *[yrsañon]{2,4} *</div>.*', '\\1', demo)
}
}
# Ville & pays
city <- html2txt(gsub(' </div>','', gsub('.*<div class=\\"tid_city\\"> ([^<]*</div>).*', '\\1', demo)))
if(!is.na(city)){if(city == '--'){city <- NA}}
country <- html2txt(gsub(' </div>','', gsub('.*<div class=\\"tid_country\\"> ([^<]*</div>).*', '\\1', demo)))
if(!is.na(country)){if(country == '--'){country <- NA}}
# --- Création data.frame infos
player <- data.frame(lang, user, available, pseudo, old_names, genre, age, city, country)
names(player) <- c("Langue", "ID", "Page trouvée", "Pseudo", "Anciens pseudos", "Genre", "Âge", "Ville", "Pays")
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Contacts et groupes =========
# --- Likes profil
likes_profile <- gsub('<[^>]*>[ ]*([0-9]*)', '\\1',
stri_extract(content, regex = '<img class=\\"tid_likeIcon\\" [^>]*>[ ]*[0-9]*')
)
if(!is.na(likes_profile)){
if(likes_profile == ""){likes_profile <- 0}
}
# --- Nombre amis et ajouté
if(stri_detect(content, regex = '<div class="tid_count">')){
contact_n_friend <- gsub('<div[^>]*>([0-9]*) (friends|amigos|amis)<span[^>]*>, ([0-9]*) (have added them|jugadores le han agregado|joueurs l\'ont ajouté).{0,3}</span>\n</div>\n', '\\1',
as.character(xml_find_first(xml, "//div[@class='tid_count']"))
)
contact_n_added <- gsub('<div[^>]*>([0-9]*) (friends|amigos|amis)<span[^>]*>, ([0-9]*) (have added them|jugadores le han agregado|joueurs l\'ont ajouté).{0,3}</span>\n</div>\n', '\\3',
as.character(xml_find_first(xml, "//div[@class='tid_count']"))
)
}else if(available == "1"){
contact_n_friend <- 0
contact_n_added <- 0
}else{
contact_n_friend <- NA
contact_n_added <- NA
}
# --- Liste amis
contacts <- stri_extract_all(contact_html, regex = "window.location='http://twinoid.com/user/[0-9]+'")[[1]]
if(is.na(contacts)){
contact_list <- NA
}else{
contact_list <- ''
for(contact in contacts){
contact <- gsub('[^0-9]*([0-9]*)[^0-9]*', '\\1', contact)
contact_list <- paste0(contact_list, ';', contact)
}
contact_list <- gsub('^;', '', contact_list)
}
# --- Liste groupes
if(str_detect(content, '<div class="narrowTab groups">')){
groups <- xml_find_all(xml, "//div[@class='group']")
group_list <- ""
for(group in groups){
group <- gsub('.*<a href="http://([^"]*)">[^<]*</a>.*', '\\1', as.character(group))
group_list <- paste0(group_list, ';' , group)
}
group_list <- gsub('^;', '', group_list)
}else{
group_list <- NA
}
# --- Création data.frame réseau
network <- data.frame(likes_profile, contact_n_friend, contact_n_added, contact_list, group_list)
names(network) <- c("Likes profil", "Nombre amis", "Nombre ajouté par", "Liste amis", "Liste groupes")
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Avatars =========
if(get_avatars && available == "1"){
dir.create(file.path(getwd(), img_dir_name), showWarnings = FALSE)
img_html <- as.character(xml_find_first(xml, "//td[@class='tid_avatar']"))
if(str_detect(img_html, '<img class="tid_avatarImg"[^>]*>')){
img_code <- -1
if(stri_detect(img_html, regex = 'src="(//imgup.motion-twin.com/[^".]*).[a-z]+"')){
img_path <- gsub('.*src="(//imgup.motion-twin.com/[^".]*).[a-z]+".*', '\\1', img_html)
img_type <- gsub('.*src="//imgup.motion-twin.com/[^".]*(.[a-z]+)".*', '\\1', img_html)
try(img_code <- GET(paste0('https:', img_path, img_type))$status_code)
}else{
img_path <- NA
img_type <- NA
}
if(img_code == 200 && !is.na(img_path) && !is.na(img_type)){
w_img <- wait
while(!file.exists(paste0(img_dir_name, '/', file_name, '_', user, img_type))){
if(w_img > wait){message(paste0('[', format(Sys.time(), "%X"), '] Échec téléchargement, nouvelle tentative dans ', w_img, 's.'))}
Sys.sleep(w_img)
try(download.file(
paste0('https:', img_path, img_type),
destfile = paste0(img_dir_name, '/', file_name, '_', user, img_type),
mode = 'wb', quiet = TRUE, cacheOK = TRUE
))
w_img <- w_img * 2
}
}else{
file.create(paste0(img_dir_name, '/', file_name, '_', user))
}
}
}
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Trophées =========
# --- Extraction de chaque ligne de trophée
goal_list <- stri_extract_all(
goals_html,
regex = '<td class=\"icon\"><img src=\"[^"]*\"/?></td><td><div class=\"name( gold)?\">[^<]*</div></td><td class=\"score\"><em>x</em>[0-9]*</td>'
)[[1]]
# --- Pour chaque ligne, extraction nom et nombre, ajout au data.frame regroupant les trophées trouvés
goals <- NULL
for(g in goal_list){
g_name <- gsub(
'<td class=\"icon\"><img src=\"[^"]*\"/?></td><td><div class=\"name(?: gold)?\">([^<]*)</div></td><td class=\"score\"><em>x</em>[0-9]*</td>',
"\\1",
g
)
g_stat <- gsub(
'<td class=\"icon\"><img src=\"[^"]*\"/?></td><td><div class=\"name(?: gold)?\">[^<]*</div></td><td class=\"score\"><em>x</em>([0-9]*)</td>',
"\\1",
g
)
goal <- data.frame(g_stat, stringsAsFactors = FALSE)
names(goal) <- g_name
if(is.null(goals)){goals <- goal}else{goals <- cbind(goals, goal)}
}
# --- Si aucun trophée, data.frame par défaut, sinon ajout des valeurs (traduites) du data.frame des trophées trouvés
if(is.na(goals[1])){
goals <- goals_null
}else{
if(lang != "fr"){
goal_name_t <- NULL
for(goals_name in names(goals)){
if(lang == "en"){
goal_nt <- as.character(goals_table[which(goals_table[,1] == goals_name),3])
}else if(lang == "es"){
goal_nt <- as.character(goals_table[which(goals_table[,2] == goals_name),3])
}
if(is.null(goal_name_t)){goal_name_t <- goal_nt}else{goal_name_t <- append(goal_name_t, goal_nt)}
}
names(goals) <- goal_name_t
}
goals <- merge.data.frame(goals_null, goals, all.y = TRUE)
goals <- goals[,order(colnames(goals))]
goals[is.na(goals)] <- 0
}
# --- Ajout Score
score <- gsub('<div class=\"overall\">([0-9]+) points?</div>', '\\1',
stri_extract(goals_html, regex = '<div class=\"overall\">[0-9]+ points?</div>')
)
score <- data.frame(score)
colnames(score) <- "Score Mush"
goals <- cbind(score, goals)
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Likes =========
if(available == "0"){
l <- NA
}else{
l <- gsub('<div class=\"name\"><span class=\"count\">([0-9]*)<em>x</em></span>[^<]*</div>', '\\1',
stri_extract(goals_html, regex = exp_likes)
)
if(is.na(l)){l <- "0"}
}
likes <- data.frame(l)
names(likes) <- "Tout le monde vous aime !"
# /////////////////////////////////////////////////////////////////////////////////
# ========= > Opérations de fin de boucle =========
# --- Date de fin du webscrapping
date <- data.frame(Sys.time())
names(date) <- "Date"
# --- Compilation des données joueur, puis ajout à la BDD globale
user_data <- cbind(player, network, goals, likes, date)
if(is.null(data)){data <- user_data}else{data <- rbind(data, user_data)}
remove(user_data)
gc(verbose = FALSE)
# --- Affichage n°
count <- count + 1
cat(paste0("Joueurs ", floor(count / user_t * 100), "% : ", lang, "-", user), fill = TRUE)
# --- Création fichier CSV puis supression données, si nombre de ligne atteint ou dernier joueur
count_lines <- count_lines +1
if(count_lines / csv_size == round(count_lines / csv_size) || user == tail(user_list, 1)){
message(paste0('Création du fichier ', file_name, csv_user_name, "_", lang, user, ".csv"))
write.csv(
data,
paste0(file_name, csv_user_name, "_", lang, user, ".csv"),
row.names = FALSE,
fileEncoding = "UTF-8",
na = ""
)
remove(data)
remove(
url, content, xml, date, available,
demo, genre, age, city, country,
on, on_list, old_names, name,
contact, contact_chk, contact_html, contact_list, contacts, contact_n_friend, contact_n_added,
groups, group, group_list,
img_html, img_path, img_type,
pseudo, likes_profile,
score, goal, goals, g, g_name, g_stat, goal_chk, goal_list, goal_name_t, goal_nt, goals_html, goals_name, likes, l,
player, network
)
gc(verbose = FALSE, full = TRUE)
data <- NULL
}
}}}}
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
# ///////// FIN BOUCLE /////////
# /////////////////////////////////////////////////////////////////////////////////
# /////////////////////////////////////////////////////////////////////////////////
}
# ################## SI SCRIPT JOUEURS ################## #
# Fin de l'exécution de la suite du script
# ####################################################### #
message('Exécution du script terminée')