Μετάβαση στο περιεχόμενο

Χρήστης:Egmontaz/Interwiki

Από τη Βικιπαίδεια, την ελεύθερη εγκυκλοπαίδεια

Κώδικας για την εύρεση σελίδων χωρίς διαγλωσσικούς συνδέσμους από αρχείο XML της βάσης δεδομένων. Βασίζεται στο Wikifind, το οποίο διανέμεται υπό την GNU General Public License (GPL) version 3 or later http://www.gnu.org/licenses/gpl.html.


/* Outcome of scanning the remainder of one <page> element. */
enum IwPageScan {
    IW_PAGE_REPORTABLE, /* closing </page> reached, no interwiki link or redirect marker seen */
    IW_PAGE_EXCLUDED,   /* closing </page> reached, page has an interwiki link or is a redirect */
    IW_PAGE_TRUNCATED   /* input ended before the closing </page> */
};

/* True when `needle` is non-empty and occurs in `haystack`.
   Empty needles are rejected because std::string::find("") matches everywhere. */
static bool iw_contains(const std::string& haystack, const std::string& needle) {
    return !needle.empty() && haystack.find(needle) != std::string::npos;
}

/* True when any of the `count` patterns occurs in `line` (empty patterns are ignored). */
static bool iw_contains_any(const std::string& line, const std::string* patterns, int count) {
    for (int i = 0; i < count; ++i) {
        if (iw_contains(line, patterns[i])) {
            return true;
        }
    }
    return false;
}

/* Reads up to `count` lines from `path` into `out`, optionally discarding the first line.
   Entries that cannot be read are left untouched (i.e. empty for a fresh array). */
static void iw_read_lines(const char* path, bool skip_first_line, std::string* out, int count) {
    std::ifstream in(path);
    if (!in) {
        std::cout << "Warning: cannot open " << path << std::endl;
        return;
    }
    std::string line;
    if (skip_first_line) {
        std::getline(in, line);
    }
    for (int i = 0; i < count && std::getline(in, line); ++i) {
        out[i] = line;
    }
}

/* Returns the text between <title> and </title> on a dump title line.
   If the closing tag is missing, everything after <title> is returned. */
static std::string iw_extract_page_name(const std::string& title_line) {
    const std::string open_tag = "<title>";
    const std::string close_tag = "</title>";
    std::string::size_type begin = title_line.find(open_tag);
    if (begin == std::string::npos) {
        return std::string();
    }
    begin += open_tag.size();
    std::string::size_type end = title_line.find(close_tag, begin);
    if (end == std::string::npos) {
        end = title_line.size();
    }
    return title_line.substr(begin, end - begin);
}

/* Consumes lines from `dump` up to and including the one containing </page>.
   Redirect patterns are checked on every line; interwiki patterns are checked only
   from the line that opens <text onwards, so that page metadata (for example an
   edit summary mentioning an interwiki link) does not count as an interwiki link. */
static IwPageScan iw_scan_page(std::istream& dump,
                               const std::string* iw, int iw_count,
                               const std::string* redirect, int redirect_count) {
    const std::string page_end = "</page>";
    const std::string text_start = "<text";
    std::string line;
    bool in_text = false;
    bool excluded = false;
    while (std::getline(dump, line)) {
        if (!in_text && line.find(text_start) != std::string::npos) {
            in_text = true;
        }
        /* Once the page is known to be excluded there is nothing left to learn;
           the remaining lines are only read to find the end of the page. */
        if (!excluded) {
            excluded = iw_contains_any(line, redirect, redirect_count)
                    || (in_text && iw_contains_any(line, iw, iw_count));
        }
        if (line.find(page_end) != std::string::npos) {
            return excluded ? IW_PAGE_EXCLUDED : IW_PAGE_REPORTABLE;
        }
    }
    return IW_PAGE_TRUNCATED;
}

/* Appends one wiki-table row ("|-" followed by "|<n>|| [[:<page>]]") to `path`. */
static void iw_append_row(const char* path, int row_number, const std::string& page_name) {
    std::ofstream out(path, std::ios::app);
    out << "|-\n";
    out << "|" << row_number << "|| [[:" << page_name << "]]\n";
}

/*
 * Scans the XML dump "articles.xml" and lists pages that have no interwiki
 * link and are not redirects.
 *
 * Inputs (read from the current directory):
 *   redirect.xml  - first line ignored, then up to 6 redirect marker strings
 *   namespace.xml - first line ignored, then up to 4 namespace strings, used in
 *                   this order: categories, templates, wikipedia, help
 *   iw.dat        - up to 60 interwiki language codes, one per line; each code
 *                   is searched for as "[[<code>:"
 *
 * Outputs (appended to, one wiki-table row per page):
 *   interwiki.txt - titles without ':'
 *   catiw.txt     - titles containing the categories namespace string
 *   temiw.txt     - titles containing the templates namespace string
 *   wikiw.txt     - titles containing the wikipedia namespace string whose page
 *                   name has no '/' (sub-pages are skipped)
 *   heliw.txt     - titles containing the help namespace string
 *
 * A progress line with the running counters is printed to stdout for each page.
 * Processing stops if the dump ends in the middle of a page.
 */
void missing_interwikis (void) {
    const int redirect_count = 6;
    const int namespace_count = 4;
    const int iw_count = 60;

    std::string redirect[redirect_count];
    iw_read_lines("redirect.xml", true, redirect, redirect_count);

    std::string nspace[namespace_count];
    iw_read_lines("namespace.xml", true, nspace, namespace_count);

    std::string iw[iw_count];
    iw_read_lines("iw.dat", false, iw, iw_count);
    for (int i = 0; i < iw_count; ++i) {
        /* Leave missing codes empty: wrapping them would yield "[[:", which
           matches ordinary links such as [[:Category:Foo]]. */
        if (!iw[i].empty()) {
            iw[i] = "[[" + iw[i] + ":";
        }
    }

    const std::string title_start = "<title>";

    int articles_without = 0;
    int tems_without = 0;
    int cats_without = 0;
    int wiki_without = 0;
    int help_without = 0;
    int tot = 0;

    std::ifstream dump("articles.xml"); /* Open dump */
    if (!dump) {
        std::cout << "File not found!" << std::endl;
        return;
    }

    std::string line;
    while (std::getline(dump, line)) {
        if (line.find(title_start) == std::string::npos) {
            continue;
        }
        const std::string title_line = line;
        tot++;
        std::cout << tot << "\t" << articles_without << "\t" << cats_without << "\t"
                  << tems_without << "\t" << wiki_without << "\t" << help_without << std::endl;

        const IwPageScan scan = iw_scan_page(dump, iw, iw_count, redirect, redirect_count);
        if (scan == IW_PAGE_TRUNCATED) {
            break; /* incomplete page at end of input: do not report it */
        }
        if (scan == IW_PAGE_EXCLUDED) {
            continue;
        }

        const std::string page_name = iw_extract_page_name(title_line);

        /* The checks are independent on purpose: a title may match several of them. */
        if (title_line.find(':') == std::string::npos) {
            articles_without++;
            iw_append_row("interwiki.txt", articles_without, page_name);
        }
        if (iw_contains(title_line, nspace[0])) {
            cats_without++;
            iw_append_row("catiw.txt", cats_without, page_name);
        }
        if (iw_contains(title_line, nspace[1])) {
            tems_without++;
            iw_append_row("temiw.txt", tems_without, page_name);
        }
        /* The sub-page test must look at the page name only: the raw title line
           always contains the '/' of the closing </title> tag. */
        if (iw_contains(title_line, nspace[2]) && page_name.find('/') == std::string::npos) {
            wiki_without++;
            iw_append_row("wikiw.txt", wiki_without, page_name);
        }
        if (iw_contains(title_line, nspace[3])) {
            help_without++;
            iw_append_row("heliw.txt", help_without, page_name);
        }
    }
}