본문 바로가기
코딩 연습/코딩배우기

웹 페이지 <script> 태그 CDATA, 넌 뭐하는 넘이니...

by good4me 2022. 1. 15.

goodthings4me.tistory.com

웹 페이지 내 <script> 태그 내용 중 //<![CDATA로 시작하는 데이터를 봤을 때 무슨 암호로 된 것인 줄 알았다. 이 부분을 잘 살펴보니 '<', '>', '/' 특수 문자와 숫자로 되어있는데 태그를 풀어쓴 것 같은 내용이었고, 이 부분이 궁금하여 유튜브와 구글에서 찾아보던 중에 이에 대한 스크래핑(크롤링) 영상이 있어서 정리해보았다.

 

파이썬 스크래핑(크롤링)으로 CDATA 추출하는 방법

 

CDATA를 쓰는 이유는,

웹 브라우저의 구문 분석기(XML Parser)가 스크립트 안의 '<', '&' 같은 문자를 마크업으로 해석하는 문제를 피하기 위해 사용하기도 하고,

웹 사이트에서 데이터를 과도하게 추출하는 것을 막기 위한 조치, 즉 스크래핑(크롤링)을 어렵게 하는 여러 장치 중 하나로도 사용한다고 한다. 다만 CDATA 표기 자체가 데이터를 숨기는 것은 아니며, 아래 샘플에서 실제로 내용을 감추는 것은 CDATA 안의 자바스크립트(문자 코드 배열을 역순으로 출력하는 코드)이다.

 

CDATA 있는 웹 페이지(샘플)

'''
<div class="col-md-6">
    <ul class="iconlist">
        <li>
            <i class="icon-phone"> </i>+855 (0)23 21 59 60
        </li>
        <li><i class="icon-mail"> </i>
            <script type="text/javascript">
            //<![CDATA[
            var l=new Array();
            l[0]='>';l[1]='a';l[2]='/';l[3]='<';l[4]='|109';l[5]='|111';l[6]='|99';l[7]='|46';l[8]='|101';l[9]='|109';l[10]='|105';l[11]='|108';l[12]='|99';l[13]='|99';l[14]='|97';l[15]='|64';l[16]='|97';l[17]='|110';l[18]='|103';l[19]='|97';l[20]='|112';l[21]='|107';l[22]='|97';l[23]='|110';l[24]='|97';l[25]='|116';l[26]='|116';l[27]='|97';l[28]='|118';l[29]='|46';l[30]='|99';l[31]='>';l[32]='"';l[33]='|109';l[34]='|111';l[35]='|99';l[36]='|46';l[37]='|101';l[38]='|109';l[39]='|105';l[40]='|108';l[41]='|99';l[42]='|99';l[43]='|97';l[44]='|64';l[45]='|97';l[46]='|110';l[47]='|103';l[48]='|97';l[49]='|112';l[50]='|107';l[51]='|97';l[52]='|110';l[53]='|97';l[54]='|116';l[55]='|116';l[56]='|97';l[57]='|118';l[58]='|46';l[59]='|99';l[60]=':';l[61]='o';l[62]='t';l[63]='l';l[64]='i';l[65]='a';l[66]='m';l[67]='"';l[68]='=';l[69]='f';l[70]='e';l[71]='r';l[72]='h';l[73]=' ';l[74]='a';l[75]='<';
            for (var i = l.length-1; i >= 0; i=i-1){
            if (l[i].substring(0, 1) == '|') document.write("&#"+unescape(l[i].substring(1))+";");
            else document.write(unescape(l[i]));}
            //]]>
            </script>
            <a href="mailto:c.vattanakpagna@acclime.com">c.vattanakpagna@acclime.com</a>
        </li>
        <li><i class="icon-globe"></i> 
            <a href="https://cambodia.acclime.com/" target="_blank">
            <i style="background-color:#2C3E50"></i>https://cambodia.acclime.com/
            </a>
        </li>

    </ul>
</div>
'''

good4me.co.kr


크롤링 해보기

from requests_html import HTMLSession
import re

# Target page: a member profile whose e-mail address is written out by an inline script.
url = 'https://www.eurocham-cambodia.org/member/555/Acclime-Cambodia'

# Fetch the page through a requests_html session.
session = HTMLSession()
r1 = session.get(url)

# Of the three <li> items under ul.iconlist, index 1 is the e-mail entry.
list_items = r1.html.find('ul.iconlist li')
email_text = list_items[1].text
print(email_text)

'''
//<![CDATA[ var l=new Array(); l[0]='>';l[1]='a';l[2]='/';l[3]='<';l[4]='|109';l[5]='|111';l[6]\
    ='|99';l[7]='|46';l[8]='|101';l[9]='|109';l[10]='|105';l[11]='|108';l[12]='|99';l[13]='|99';l[14]\
    ='|97';l[15]='|64';l[16]='|97';l[17]='|110';l[18]='|103';l[19]='|97';l[20]='|112';l[21]='|107';l[22]\
    ='|97';l[23]='|110';l[24]='|97';l[25]='|116';l[26]='|116';l[27]='|97';l[28]='|118';l[29]='|46';l[30]\
    ='|99';l[31]='>';l[32]='"';l[33]='|109';l[34]='|111';l[35]='|99';l[36]='|46';l[37]='|101';l[38]\
    ='|109';l[39]='|105';l[40]='|108';l[41]='|99';l[42]='|99';l[43]='|97';l[44]='|64';l[45]='|97';l[46]\
    ='|110';l[47]='|103';l[48]='|97';l[49]='|112';l[50]='|107';l[51]='|97';l[52]='|110';l[53]='|97';l[54]\
    ='|116';l[55]='|116';l[56]='|97';l[57]='|118';l[58]='|46';l[59]='|99';l[60]=':';l[61]='o';l[62]='t';l[63]\
    ='l';l[64]='i';l[65]='a';l[66]='m';l[67]='"';l[68]='=';l[69]='f';l[70]='e';l[71]='r';l[72]='h';l[73]=' ';l[74]\
    ='a';l[75]='<'; for (var i = l.length-1; i >= 0; i=i-1){ if (l[i].substring(0, 1) == '|') \
    document.write("&#"+unescape(l[i].substring(1))+";"); else document.write(unescape(l[i]));} //]]>  
'''

- 해당 부분을 https://www.regextester.com/ 에 복사해서 정규표현식으로 분석한 후,

 

# Pull out every single-quoted token from the script text. Note that this also
# matches the '|' literal used in the script's comparison, not only the array values.
quoted_token = re.compile(r"'(.*?)'")
chars = quoted_token.findall(email_text)
print(chars)

'''
['>', 'a', '/', '<', '|109', '|111', '|99', '|46', '|101', '|109', '|105', '|108', '|99', '|99', '|97', '|64', '|97', '|110', '|103', '|97', '|112', '|107', '|97', '|110', '|97', '|116', '|116', '|97', '|118', '|46', '|99', '>', '"', '|109', '|111', '|99', '|46', '|101', '|109', '|105', '|108', '|99', '|99', '|97', '|64', '|97', '|110', '|103', '|97', '|112', '|107', '|97', '|110', '|97', '|116', '|116', '|97', '|118', '|46', '|99', ':', 'o', 't', 'l', 'i', 'a', 'm', '"', '=', 'f', 'e', 'r', 'h', ' ', 'a', '<', '|']
'''

 

- for 문으로 확인,

# Walk the extracted tokens: a '|NNN' token is a decimal character code that is
# converted with chr(); any other token is printed unchanged.
for token in chars:
    is_code = token[0] == '|'
    print(chr(int(token[1:])) if is_code else token)
        
'''
print(chr(int(char[1:])))
ValueError: invalid literal for int() with base 10: ''
'''

- 마지막에 오류 메시지가 나온다. 정규표현식이 자바스크립트 비교문(== '|')의 '|'까지 추출했고, 이 항목은 '|' 뒤에 숫자가 없어 int('')가 실패하기 때문이다. --> 예외 처리 필요

 

DECIMAL to CHAR

 

# Decode the extracted tokens: '|NNN' entries carry a decimal character code,
# every other token is already a literal character.
output = []
for char in chars:
    # startswith() is safe for an empty token, whereas char[0] would raise IndexError.
    if not char.startswith('|'):
        output.append(char)
        continue
    try:
        # Only the conversion can fail, e.g. a bare '|' with no digits after it.
        decoded = chr(int(char[1:]))  # chr() turns the decimal code into a character
    except ValueError as e:
        print(f'Error: {e}')  # Error: invalid literal for int() with base 10: ''
        continue
    output.append(decoded)

'''
['>', 'a', '/', '<', '|109', '|111', '|99', '|46', '|101', '|109', '|105', '|108', '|99', '|99', '|97', '|64', '|97', '|110', '|103', '|97', '|112', '|107', '|97', '|110', '|97', '|116', '|116', '|97', '|118', '|46', '|99', '>', '"', '|109', '|111', '|99', '|46', '|101', '|109', '|105', '|108', '|99', '|99', '|97', '|64', '|97', '|110', '|103', '|97', '|112', '|107', '|97', '|110', '|97', '|116', '|116', '|97', '|118', '|46', '|99', ':', 'o', 't', 'l', 'i', 'a', 'm', '"', '=', 'f', 'e', 'r', 'h', ' ', 'a', '<', '|']
Error: invalid literal for int() with base 10: ''
'''

# Show the decoded characters; they are still in the reversed order used by the script.
print(output)

'''
['>', 'a', '/', '<', 'm', 'o', 'c', '.', 'e', 'm', 'i', 'l', 'c', 'c', 'a', '@', 'a', 'n', 'g', 'a', 'p', 'k', 'a', 'n', 'a', 't', 't', 'a', 'v', 
'.', 'c', '>', '"', 'm', 'o', 'c', '.', 'e', 'm', 'i', 'l', 'c', 'c', 'a', '@', 'a', 'n', 'g', 'a', 'p', 'k', 'a', 'n', 'a', 't', 't', 'a', 'v', '.', 'c', ':', 'o', 't', 'l', 'i', 'a', 'm', '"', '=', 'f', 'e', 'r', 'h', ' ', 'a', '<']
'''

# Flip the decoded characters back into reading order and show them as a list.
print(output[::-1])

'''
['<', 'a', ' ', 'h', 'r', 'e', 'f', '=', '"', 'm', 'a', 'i', 'l', 't', 'o', ':', 'c', '.', 'v', 'a', 't', 't', 'a', 'n', 'a', 'k', 'p', 'a', 'g', 
'n', 'a', '@', 'a', 'c', 'c', 'l', 'i', 'm', 'e', '.', 'c', 'o', 'm', '"', '>', 'c', '.', 'v', 'a', 't', 't', 'a', 'n', 'a', 'k', 'p', 'a', 'g', 'n', 'a', '@', 'a', 'c', 'c', 'l', 'i', 'm', 'e', '.', 'c', 'o', 'm', '<', '/', 'a', '>']
'''

# Join the characters in reading order to recover the original anchor tag.
print(''.join(reversed(output)))

'''
<a href="mailto:c.vattanakpagna@acclime.com">c.vattanakpagna@acclime.com</a>
'''

 

[출처] 참고 영상 : HIDING Data with JavaScript? Web Scraping Obfuscation

 

댓글