lemarios/verbos-espanol-conjugaciones-software/src/main/groovy/com/acceso/lemarios/ExtracInflectedForms.groovy

103 lines
2.6 KiB
Groovy

package com.acceso.lemarios
import org.jsoup.nodes.Document
import org.jsoup.Jsoup
import org.jsoup.select.Elements
/**
 * Scrapes inflected (conjugated) verb forms from the RAE online dictionary.
 *
 * For every verb read from the bundled verb list, the search page is queried,
 * the links to the conjugation pages are collected, and the text of each
 * conjugation page is split into individual word forms. The forms are
 * appended, one per line, to {@link #OUTPUT_FILE}.
 */
class ExtractInflectedForms {

    private final static String OUTPUT_FILE = '/tmp/inflectedForms.txt'
    private final static String VERB_FILE = '/com/acceso/lemarios/verbos-espanol.txt'
    private final static String LEMA_SEARCH_URL = 'http://lema.rae.es/drae/srv/search?val='
    private final static String INFLECTED_FORMS_URL = 'http://lema.rae.es/drae/srv/'
    // Tokens that show up in the conjugation text but are dropped from the output.
    private final static List<String> STOP_WORDS = ['me', 'te', 'se', 'nos', 'os', 'etc.']

    private File outputFile = new File(OUTPUT_FILE)

    /**
     * Runs the whole extraction: loads the verbs, gathers the inflected forms
     * of each one (printing a progress line per verb), removes the stop words
     * and appends the result to the output file.
     *
     * Duplicates are not removed and the forms are not sorted.
     */
    public void execute() {
        List<String> inflected = []
        List<String> verbs = loadVerbs()
        verbs.eachWithIndex { verb, index ->
            println "[${index}, ${verbs.size()}]"
            inflected.addAll(extract(verb))
        }
        inflected.removeAll(STOP_WORDS)
        save(inflected)
    }

    /**
     * Collects the inflected forms of one verb across all of its conjugation pages.
     *
     * A page that fails to download or parse is reported on stdout and
     * contributes no forms; the remaining pages are still processed.
     *
     * @param verb the verb to look up
     * @return the forms found, possibly empty
     */
    private List<String> extract(String verb) {
        List<String> inflectedUrls = getInflectedUrls(verb)
        List<String> results = inflectedUrls.collect { url ->
            try {
                return getInflectedForms(verb, url)
            } catch (Exception e) {
                println "Error [${verb}] [${url}]: ${e}"
                return []
            }
        }.flatten()
        return results
    }

    /**
     * Downloads one conjugation page and splits the text of every
     * {@code <p class="z">} element into word forms.
     *
     * The text is split on commas, slashes, spaces and the conjunctions
     * " o " / " u "; each piece is trimmed and empty pieces are discarded.
     *
     * @param verb         the verb being processed (used only for logging)
     * @param inflectedUrl URL of the conjugation page
     * @return the forms found on the page, possibly empty
     */
    private List<String> getInflectedForms(String verb, String inflectedUrl) {
        Document doc = Jsoup.connect(inflectedUrl).get()
        Elements elements = doc.select("p[class=z]")
        List<String> results = elements.collect { element ->
            element.text().split(',|/| o | u | ').collect { it.trim() }.findAll { it }
        }.flatten()
        if (!results) {
            println "[0 inflected forms for ${verb}]"
        }
        return results
    }

    /**
     * Returns the URLs of the conjugation pages for a verb.
     *
     * The search page is queried with the verb encoded as ISO-8859-1, and the
     * {@code href} of every link wrapping an image whose {@code alt} text
     * starts with "Ver conjugaci" is appended to {@link #INFLECTED_FORMS_URL}.
     *
     * @param verb the verb to search for
     * @return the conjugation page URLs; empty if none were found or the request failed
     */
    private List<String> getInflectedUrls(String verb) {
        List<String> result = []
        try {
            Document doc = Jsoup.connect(LEMA_SEARCH_URL + URLEncoder.encode(verb, 'ISO-8859-1')).get()
            Elements elements = doc.select("a img[alt^=Ver conjugaci]")
            result = elements.collect { element -> INFLECTED_FORMS_URL + element.parent().attr('href') }
        } catch (Exception e) {
            // This is a method body, not a closure, so there is no implicit `it`
            // to report here; referencing it would throw from inside the handler
            // and abort the whole run.
            println "Error [${verb}]: ${e}"
        }
        return result
    }

    /**
     * Appends the given forms to the output file, one per line.
     *
     * The file is opened once for the whole batch. Nothing is written, and the
     * file is not created, when the list is empty.
     *
     * @param forms the forms to write
     */
    private void save(List<String> forms) {
        if (!forms) {
            return
        }
        outputFile.withWriterAppend { writer ->
            forms.each { form -> writer << "${form}\n" }
        }
    }

    /**
     * Loads the verbs from the classpath resource {@link #VERB_FILE}.
     *
     * Only lines containing the character 'í' are kept.
     * NOTE(review): this filter looks like a leftover from a partial run
     * restricted to a subset of verbs — confirm whether it is intended.
     *
     * @return the selected verbs; empty if the resource cannot be found
     */
    private List<String> loadVerbs() {
        List<String> result = []
        InputStream i = getClass().getResourceAsStream(VERB_FILE)
        if (i) {
            i.eachLine { line -> if (line.contains('í')) result << line }
        } else {
            println "Verb file not found on classpath: ${VERB_FILE}"
        }
        return result
    }

    /** Command-line entry point; the arguments are ignored. */
    public static void main(String[] args) {
        ExtractInflectedForms e = new ExtractInflectedForms()
        e.execute()
    }
}