%%
% (C) Denes Szucs; 2016
% Please note that this code is provided as is, without any support.

%% INITIALIZE pdftoolbox
% Reset the workspace and the Java state, then put the PDFBox and FontBox
% jars on the dynamic Java class path.
clear
clear('java')
javaaddpath('D:\matlab\pdfparse\PDFBox-0.7.3\lib\PDFBox-0.7.3.jar');
javaaddpath('D:\matlab\pdfparse\FontBox-0.1.0\lib\FontBox-0.1.0.jar');

% Java objects used by the extraction loop:
%   pdfdoc - document handle (re-assigned for every file that is loaded)
%   reader - text stripper that converts a loaded document to text
pdfdoc = org.pdfbox.pdmodel.PDDocument();
reader = org.pdfbox.util.PDFTextStripper();


%%
% START SEARCH and read data from PDF folder

% Search tokens, one column per test type (column 1: F test, column 2: t test).
%   FindWhat     - token with opening parenthesis (general psychology style)
%   FindExtender - bare letter (Nature style)
FindWhat     = {'F(', 't('};
FindExtender = {'F',  't' };

% Number of characters kept after the position of a hit; each stored snippet
% is therefore StrLength+1 characters long.
StrLength = 65;

% Root folder holding one sub-folder of PDFs per journal
Folder = 'D:\DATA\PDFsForAnalysis\pdfs\';
cd('D:\DATA\PDFsForAnalysis\pdfs')

% Journal sub-folders; the position in this list is the directory number
% recorded for every hit.
SubDirs = {'Psychological Science', 'Cognitive Psychology', 'Cognition', ...
           'Acta Psychologica', 'JECP', ...
           'Nature Neuroscience', 'Neuron', 'Brain', ...
           'The Journal of Neuroscience', 'Cerebral Cortex', ...
           'Neuroimage', 'Cortex', 'Biological Psychology', ...
           'Neuropsychologia', 'Neuroscience', ...
           'Biological Psychiatry', 'J Psychiatric Research', ...
           'Neurobiology of Aging'};

% initialize
% Main extraction pass. For every PDF in every journal folder the text is
% pulled out with PDFBox, candidate F/t statistics are located and a
% fixed-length snippet is stored for each hit that looks like it reports a
% p value.
%   Fstr / Tstr   : cleaned-up snippets, one row per hit ('@' = unused filler)
%   Fst1          : untouched copy of each F snippet (for error checking)
%   FDir / TDir   : index into SubDirs for each stored row
%   FIndx / TIndx : index into that folder's file listing for each stored row
%   EncryptedFiles, ErrFiles : files skipped (encrypted) or failed (Java error)
clear Files FIndx TIndx BIndx RIndx ErrFiles FDir TDir BDir RDir
Fstr = repmat('@',[2*1E5 StrLength+1]); 
Fst1 = repmat('@',[2*1E5 StrLength+1]); 
Tstr = repmat('@',[2*1E5 StrLength+1]); 
FDir = zeros([2*1E5 1]); FIndx = zeros([2*1E5 1]);
TDir = zeros([2*1E5 1]); TIndx = zeros([2*1E5 1]);
Dirs = [];
Fnum = 1; Tnum = 1;            % next free row in the F / T buffers
Errnum = 0; Encrynum = 1; SuccessFiles = 0;
% d=1 ; i=1; fi = 1; s = 1; % For testing
tic
for d = 1:length(SubDirs)
    DirList = dir(strcat(Folder,char(SubDirs(d)),'\','*.pdf'));        
    for i = 1:length(DirList)
        Dirs(d).FileNames(i) = cellstr(DirList(i).name);
        fprintf('%u%s%u%s%s%s%s\n',i,'/',...
                 length(DirList),' : File: ',char(SubDirs(d)) ,'  :  ', DirList(i).name);
        % NOTE(review): load() sits outside the try block below, so an
        % exception raised while opening a file stops the whole run - confirm
        % that this is intended.
        pdfdoc = pdfdoc.load(strcat(Folder,char(SubDirs(d)),'\',DirList(i).name));
        if pdfdoc.isEncrypted
            pdfdoc.getDocument().close;
            fprintf('%s%s\n','  ++++ > Encripted file  > ', DirList(i).name);
            EncryptedFiles(Encrynum).Name = DirList(i).name;
            Encrynum = Encrynum + 1;
        else
            try
                pdfstr = reader.getText(pdfdoc); % critical point; this gives java error for some files: TRY/CATCH
                pdfdoc.getDocument().close;
                pdfstr = deblank(char(pdfstr));

                SuccessFiles = SuccessFiles + 1;
                SuccessFileSeries(SuccessFiles) = SuccessFiles;
                %Files(SuccessFiles).Name = DirList(SuccessFiles).name;

                for fi = 1:length(FindWhat)
                    Where1 = strfind(pdfstr,char(FindWhat(fi)) ); % protected finds; F( , t(
                    Where2 = strfind(pdfstr,char(FindExtender(fi)) ); % extender: F , t
                    if ~isempty(Where1) || ~isempty(Where2)
                        % get rid of doubles in 2 relative to 1
                        % F( F(  F(
                        Doubles = find(ismember(Where2, Where1));
                        Where2(Doubles) = []; 
                        % get rid of too close ones protecting type 1
                        % IdM labels every position: 1 = protected hit (from
                        % FindWhat), 2 = extender hit (from FindExtender)
                        IdM = [repmat(1,[1 length(Where1)])  repmat(2,[1 length(Where2)])];
                        [WhereAll WInd] = sort([Where1 Where2]);
                        IdM = IdM(WInd);
                        WhereTooClose = 1; % dummy non-empty value so the loop is entered
                        while ~isempty(WhereTooClose)
                            WhereTooClose = find(diff(WhereAll)<20)+1;  % diff: y(2) - y(1); etc...
                            Protected = find(IdM(WhereTooClose)==1);
                            % next line is not good cos while N. line is protected, may delete N-1 which may also be protected!!
                            %WhereTooClose(Protected) = WhereTooClose(Protected)-1; % change to the one before                            
                            WhereTooClose(Protected) = [];
                            WhereAll(WhereTooClose)=[];
                            % FIX: delete the same entries from the label
                            % vector; otherwise IdM no longer lines up with
                            % WhereAll on the next pass and the wrong hits
                            % are treated as protected.
                            IdM(WhereTooClose)=[];
                        end
                        % check whether we have enough space after the last found items; if not, remove them
                        WhereAll(find((WhereAll+StrLength) > length(pdfstr))) = [];
                        
                        % analyze each string
                        for s = 1:length(WhereAll)
                            FoundStr = pdfstr(WhereAll(s):WhereAll(s)+StrLength);
                            FoundStr1 = FoundStr; % save orignal line for error checking if needed
                            % remove Enters, etc (just for dispay check):
                            % every character code below 14 becomes a space
                            FoundStr(FoundStr<14)=32;
                            % change strange occas. STRANGE signs in some journals to = sign : CORTEX
                            % it's good to do all this here to avoid huge array size
                            % BUT: beware of removing important stuff; e.g. p< 0.05
                            % can test if enabling  HelpFlag2 = 1;
                            % All replacements below have the same length as
                            % the text they replace, so FoundStr keeps its
                            % StrLength+1 width and still fits a buffer row.
                            
                            FoundStr(strfind(FoundStr,'ź')) = '='; % = in Cortex BUT > in Neuropsychologia! * ie. p value errors in Neuropsychologia must be double checked
                            
                            FoundStr = strrep(FoundStr,'H11005','=     '); % The Journal of Neuroscience; this comes at t(df)= & p=
                            FoundStr = strrep(FoundStr,'H11021','<     '); % The Journal of Neuroscience; this comes at p<
                            FoundStr = strrep(FoundStr,'H11022','>     '); % The Journal of Neuroscience; this comest at p > 0.05
                            FoundStr = strrep(FoundStr,'H11002','     -'); % The Journal of Neuroscience: minus
                            % NOTE(review): the last character of the next replacement may be a
                            % mis-decoded plus/minus sign - check the encoding of this file.
                            FoundStr = strrep(FoundStr,'H11006','     ą'); % The Journal of Neuroscience: plus/minus
                            FoundStr = strrep(FoundStr,'H9257  2'   ,'ETA2    '); % The Journal of Neuroscience
                            FoundStr = strrep(FoundStr,'H9257  p  2','ETAPAr2    '); % The Journal of Neuroscience
                            FoundStr = strrep(FoundStr,'H11003','     *'); % The Journal of Neuroscience: *
                            FoundStr = strrep(FoundStr,'C17','ETA');       % BRAIN
                            FoundStr = strrep(FoundStr,'p40','p=0');       % BRAIN
                            FoundStr = strrep(FoundStr,'p50','p=0');       % BRAIN
                            FoundStr = strrep(FoundStr,'= C0','=  -');     % BRAIN
                            FoundStr = strrep(FoundStr,'=C0','= -');       % BRAIN
                            FoundStr = strrep(FoundStr,'P5','p<');         % BRAIN; p<
                            FoundStr = strrep(FoundStr,'P  5','P  <');     % BRAIN; p<                            
                            FoundStr = strrep(FoundStr,'P4','p>');         % BRAIN; p>
                            
                            % remove spaces for checking 'p='
                            FoundStrNOSPACE = FoundStr;
                            FoundStrNOSPACE(FoundStrNOSPACE==32)=[]; % remove spaces for analysis
                            
                            % only analyze text involving = < > signs and 'p' char; this is likely to have p value                            
                            % this is necessary here otherwise F/Tstr are enormous
                            if  ( ...
                                (length(strfind(FoundStr(1:end),'='))>1)       | ...   % min TWO = signs
                                ( (length(strfind(FoundStr(1:end),'<'))>0) &     ...   % min 1 = AND 1 <
                                  (length(strfind(FoundStr(1:end),'='))>0)   ) | ...
                                ( (length(strfind(FoundStr(1:end),'>'))>0) &     ...   % min 1 = AND 1 > (for n.s.)
                                  (length(strfind(FoundStr(1:end),'='))>0)   )   ...
                                ) ...
                                & ...
                                ( ...
                                (~isempty(strfind(lower(FoundStrNOSPACE),'p='))) | ... % AND there is a p= ; this restricts very well for T TESTS, less relevant for F tests (there 'p' is OK)
                                (~isempty(strfind(lower(FoundStrNOSPACE),'p>'))) | ... % AND there is a p> ; this restricts very well for T TESTS, less relevant for F tests (there 'p' is OK)
                                (~isempty(strfind(lower(FoundStrNOSPACE),'p<'))) | ... % AND there is a p< ; this restricts very well for T TESTS, less relevant for F tests (there 'p' is OK)
                                (~isempty(strfind(lower(FoundStrNOSPACE),'n.s.'))) |  ...
                                (~isempty(strfind(lower(FoundStrNOSPACE),'ns')))      ...
                                 )
                                HelpFlag2 = 1;
                            else 
                                HelpFlag2 = 0;
                            end

                            %HelpFlag2 = 1; % if this is active; there's no =<>p check; all strings go thru
                            % (fi only reaches 3 if a third entry is added to FindWhat)
                            if fi==3 % 
                                HelpFlag2 = 1; % no filtering otherwise nothing will be found!
                            end
                            
                            % analysis runs if any of the above are true
                            if HelpFlag2==1
                                switch fi
                                    case 1 % F test
                                        Fstr(Fnum,:) = FoundStr;
                                        Fst1(Fnum,:) = FoundStr1;
                                        FDir(Fnum,1)   = d; % directory number
                                        FIndx(Fnum,1) = i;
                                        Fnum = Fnum + 1;
                                    case 2 % t test
                                        Tstr(Tnum,:) = FoundStr;
                                        TDir(Tnum,1)   = d; % directory number
                                        TIndx(Tnum,1) = i;
                                        Tnum = Tnum + 1;
                                end % switch
                            end % end of analysis if HeLPFLAG
                        end % for s
                    end % ~isempty
                end % fi
            catch MEsg
                pdfdoc.getDocument().close;
                Errnum = Errnum + 1;                
                fprintf('%s%s\n','  **** > FILE ERROR  > ', DirList(i).name);
                % FIX: show at most 56 characters; a fixed 1:56 index fails
                % for shorter messages and would abort the run from inside
                % the error handler.
                disp(MEsg.message(1:min(56,length(MEsg.message))));
                fprintf('%s%s\n','  **** > Error message truncated here < ', DirList(i).name);
                ErrFiles(Errnum).Name = DirList(i).name;
                ErrFiles(Errnum).Mesg = MEsg;
            end% try
        end % not encrypted
    end
    ParsedInEachFolder(d)  = length(DirList); % Total number of parsed files in each folder
    SuccessInEachFolder(d) = SuccessFiles; % No of successfully parsed files in each folder (consec)
end
toc
% Elapsed time is 3118.371983 seconds / 60 =  51.97 mins for ALL


%% Initial summary statistics
% Collects folder-level bookkeeping in DirInfoVAR, trims the unused tail of
% the pre-allocated result buffers and prints a short report.

% SuccessInEachFolder was recorded as a running total over folders;
% convert it to a per-folder count.
SuccessInEachFolder = [SuccessInEachFolder(1) diff(SuccessInEachFolder)];
DirInfoVAR.Folder = Folder;
DirInfoVAR.SubDirs = SubDirs;
DirInfoVAR.SuccessInEachFolder = SuccessInEachFolder;
DirInfoVAR.ParsedInEachFolder = ParsedInEachFolder;
DirInfoVAR.SuccessFiles = SuccessFiles;
DirInfoVAR.ParsedALL = sum(ParsedInEachFolder)

% delete extra length matrix data
% (Fnum / Tnum point at the first unused row of the F / T buffers)
Fstr(Fnum:end,:) = [];
Fst1(Fnum:end,:) = [];

Tstr(Tnum:end,:) = [];
FDir(Fnum:end,:) = [];
TDir(Tnum:end,:) = [];
% added: 03 Dec 2015
FIndx(Fnum:end,:) = [];
TIndx(Tnum:end,:) = [];

% stats
fprintf('\n%s%u\n','Files parsed: ',sum(ParsedInEachFolder));
fprintf('%s%u\n','Successful reads: ',SuccessFiles);
fprintf('%s\n','Successful (top) and total (bottom) reads in each folder and success %:');
disp(SubDirs)
disp(SuccessInEachFolder)
disp(ParsedInEachFolder)
disp(round((SuccessInEachFolder ./ ParsedInEachFolder)*100))
% FIX: the r-value line was removed. Rstr is never created by this script,
% so referencing it raised an "undefined variable" error before anything
% was printed.
fprintf('%s%u\n%s%u\n','Initial F values (potential) = ',size(Fstr,1) , ...
                       'Initial T values (potential) = ',size(Tstr,1)  );

% ** SAVE POINT 1
% save 'RAW F,T,B,R data for ALL'  Tstr Fstr Bstr Rstr   FDir TDir BDir RDir   DirInfoVAR Dirs   FIndx TIndx BIndx RIndx
% check?
% uint16(Fst1(2,1:20))


%% Standardize formats for easier checking and making sure results are correct

% Fstr standardization: it is clearer to do here than in analysis part
% bring all to the standard reporting format, easier to check outcome: F (1,23) = xx
for i = 1:size(Fstr,1)
    a = Fstr(i,:);
    % correct strings with no parentheses
    [Fstart,Fend] = regexpi(a,'F\s*\d*,\d*\s*=');  % e.g. >> 'F  2,14   ='
    if ~isempty(Fstart)
        % Only the first match is rewritten. Both replacements swap one
        % space for one parenthesis, so the row keeps its length.
        Fnd = a(Fstart(1):Fend(1));
        Fnd = strrep(Fnd,'F ','F(');
        Fnd = strrep(Fnd,' =',')=');
        Fstr(i,Fstart(1):Fend(1)) = Fnd;
    end
end
% The following pass only makes visual check easier but not necessary for analysis
% Flag every row that does not contain 'F(' directly followed by a digit
% (after removing spaces).
FDelFlag = zeros([size(Fstr,1) 1]);
for i = 1:size(Fstr,1)
    a = Fstr(i,:); 
    a(a==32)=[];  % remove spaces to simplify search
    % strsplit(a,{'',''})
    % regexpi(a,'\d*','match') % all nums
    % FIX: the parenthesis must be escaped to match a literal '('; unescaped
    % it starts a group, so the pattern did not test for 'F(' at all.
    % '\d+' requires at least one df digit after the parenthesis.
    DF1 = regexpi(a,'F\(\d+','match');
    if isempty(DF1)
        FDelFlag(i) = 1;
    end
end
for i = 1:size(Fstr,1)
    fprintf('%u %s %s %s \t%u \n', i,' : ',Fstr(i,:), '  <<>>   Del:',FDelFlag(i))
end
fprintf('%s%u%s%u\n','Initial Fstr size: ',size(Fstr,1),' ; To Delete:',nnz(FDelFlag) );
% delete non-standard entries
FDrop = (FDelFlag ~= 0);  % one mask, applied to every per-row array
% FIX: prune the untouched-snippet copy as well so that its rows stay
% aligned with Fstr (skipped if Fst1 is absent or has a different height).
if exist('Fst1','var') && size(Fst1,1) == numel(FDrop)
    Fst1(FDrop,:) = [];
end
Fstr(FDrop,:) = [];
FIndx(FDrop,:) = [];
FDir(FDrop,:) = [];
FDelFlag(FDrop,:) = [];
%FDelFlag = 0;


% ***
% Tstr standardization
% Bring every t entry to the standard reporting format, t(df)= xx, so the
% printed list is easier to check.
nT = size(Tstr,1);
for row = 1:nT
    txt = Tstr(row,:);
    % add parentheses where the df was written without them, e.g. >> 't  23   ='
    [hitStart,hitEnd] = regexpi(txt,'t\s*\d*\s*=');
    %regexpi(txt,'t\s*\d*\s*=','match')
    if isempty(hitStart)
        continue
    end
    % only the first match is rewritten; the replacements keep the length
    piece = txt(hitStart(1):hitEnd(1));
    piece = strrep(strrep(piece,'t ','t('),' =',')=');
    Tstr(row,hitStart(1):hitEnd(1)) = piece;
end

% The following pass only makes visual checks easier but not necessary for analysis
% Every row starts out flagged for deletion; the flag is cleared when the
% space-free text contains a t...= pattern from which a number can be read
% after 't('. Each row is printed together with its flag.
TDelFlag = ones([nT 1]);
for row = 1:nT
    txt = Tstr(row,:);
    txt = txt(txt~=32);  % drop spaces to simplify the search
    % parentheses are command characters for finding groups/tokens, hence
    % the bracketed character set
    dfHit = regexpi(txt,'t[(\d*.d*)]+=','match');
    if ~isempty(dfHit) && ~isempty(sscanf(char(dfHit(1))','t(%f'))
        TDelFlag(row) = 0;
    end
    fprintf('%u %s %s %s %u \n', row,' : ',Tstr(row,:), '  <<>>   Del: ',TDelFlag(row))
end
fprintf('%s%u%s%u\n','Initial Tstr size: ',size(Tstr,1),' ; To Delete:',nnz(TDelFlag) );

% delete non-standard entries from the snippets and their bookkeeping vectors
dropRows = (TDelFlag ~= 0);
Tstr(dropRows,:) = [];
TIndx(dropRows,:) = [];
TDir(dropRows,:) = [];
TDelFlag(dropRows,:) = [];